In [1]:
from helpers import load_dataset, preprocessing_clicks

import pandas as pd
import numpy as np
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.evaluation.python_evaluation import map, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.utils.constants import SEED as DEFAULT_SEED
from recommenders.utils.timer import Timer
from recommenders.utils.notebook_utils import store_metadata
from recommenders.models.deeprec.deeprec_utils import prepare_hparams
import tensorflow as tf
tf.get_logger().setLevel('ERROR')

In [2]:
TOP_K = 10
EPOCHS = 50
BATCH_SIZE = 1024
SEED = DEFAULT_SEED

In [3]:
# Load datasets
df_articles, df_clicks, article_embeddings = load_dataset()

# Preprocessing articles
df_clicks = preprocessing_clicks(df_clicks)

In [5]:
# Split clicks dataset into training and test sets
df_clicks = df_clicks.sample(n=10000, random_state=SEED)
df_clicks.rename(columns={'user_id': 'userID', "click_article_id":"itemID"}, inplace=True)
df_clicks["itemID"] = df_clicks["itemID"].astype('int64')
df_clicks['rating'] = 1.0

train, test = python_stratified_split(df_clicks, ratio=0.8, col_user="userID", col_item="itemID", seed=SEED)

In [31]:
train.head()

Unnamed: 0,userID,session_id,session_size,itemID,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type,session_start_dt,click_timestamp_dt,click_hour,click_dayofweek,rating
89128,10,1507917672187324,14,199207,4,1,17,1,18,1,2017-10-13 18:01:12,2017-10-13 20:01:48.504,20,4,1.0
232034,24,1507909485182712,4,225010,4,1,17,1,25,1,2017-10-13 15:44:45,2017-10-13 15:45:23.273,15,4,1.0
2565312,24,1507855797268825,3,156381,4,1,17,1,25,1,2017-10-13 00:49:57,2017-10-13 00:58:48.496,0,4,1.0
1493296,26,1507629776306390,9,31488,4,3,2,1,13,1,2017-10-10 10:02:56,2017-10-10 10:10:53.711,10,1,1.0
858287,32,1506825956252769,3,235840,4,3,2,1,25,2,2017-10-01 02:45:56,2017-10-01 03:11:33.776,3,6,1.0


In [32]:
train['old_index'] = train.index
train.reset_index(drop=True, inplace=True)

In [29]:
data.test

Unnamed: 0,userID,itemID,rating
0,156,2559,1.0
1,336,1044,1.0
2,374,2560,1.0
3,400,2561,1.0
4,420,137,1.0
5,437,820,1.0
6,471,332,1.0
7,481,103,1.0
8,489,134,1.0
9,852,1930,1.0


In [38]:
train[["userID", "itemID","old_index"]]

Unnamed: 0,userID,itemID,old_index
0,10,199207,89128
1,24,225010,232034
2,24,156381,2565312
3,26,31488,1493296
4,32,235840,858287
...,...,...,...
9949,322151,277107,1599872
9950,322326,331116,1402380
9951,322466,30760,1404883
9952,322490,42876,1405206


In [42]:
# Create userID and itemID mappings for train
user_mapping_train = dict(zip(train['userID'], data.train['userID']))
item_reverse_mapping_train = dict(zip(data.train['itemID'], train['itemID']))

# Create userID and itemID mappings for test
user_mapping_test = dict(zip(test['userID'], data.test['userID']))
item_reverse_mapping_test = dict(zip(data.test['itemID'], test['itemID']))

# Combine userID mappings
user_mapping = {**user_mapping_train, **user_mapping_test}
# Combine itemID mappings (we reverse both train and test item mappings)
item_reverse_mapping = {**item_reverse_mapping_train, **item_reverse_mapping_test}

In [43]:
# Example usage
user_id_example = 10
item_id_example = 0

print(f"UserID {user_id_example} corresponds to UserID {user_mapping[user_id_example]}")
print(f"ItemID {item_id_example} corresponds to ItemID {item_reverse_mapping[item_id_example]}")

user_id_example = 11
item_id_example = 4

print(f"UserID {user_id_example} corresponds to UserID {user_mapping[user_id_example]}")
print(f"ItemID {item_id_example} corresponds to ItemID {item_reverse_mapping[item_id_example]}")

UserID 10 corresponds to UserID 0
ItemID 0 corresponds to ItemID 199207


KeyError: 11

In [24]:
data.train[data.train["itemID"]== 0].head()

Unnamed: 0,userID,itemID,rating
0,0,0,1.0
4894,4549,0,1.0


In [34]:
train[train["itemID"]== 199207].head()

Unnamed: 0,userID,session_id,session_size,itemID,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type,session_start_dt,click_timestamp_dt,click_hour,click_dayofweek,rating,old_index
0,10,1507917672187324,14,199207,4,1,17,1,18,1,2017-10-13 18:01:12,2017-10-13 20:01:48.504,20,4,1.0,89128
4894,83621,1507911495150449,12,199207,4,1,17,1,25,1,2017-10-13 16:18:15,2017-10-13 17:15:02.169,17,4,1.0,239802


In [9]:
data = ImplicitCF(train=train, test=test, seed=SEED, col_user="userID", col_item="itemID")

In [10]:
yaml_file = "input/lightgcn.yaml"
# Hyperparameters configuration
hparams = prepare_hparams(
    yaml_file,
    n_layers=3,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=0.005,
    eval_epoch=5,
    top_k=TOP_K,
    save_model=True,
    save_epoch=50,
    MODEL_DIR="./input/models/"
)

In [11]:
model = LightGCN(hparams, data, seed=SEED)

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


2024-05-27 10:47:22.902573: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:375] MLIR V1 optimization pass is not enabled


In [12]:
# Train the model
with Timer() as train_time:
    model.fit()
print("Took {} seconds for training.".format(train_time.interval))


Epoch 1 (train)0.2s: train loss = 0.67151 = (mf)0.67150 + (embed)0.00001
Epoch 2 (train)0.1s: train loss = 0.56924 = (mf)0.56921 + (embed)0.00003
Epoch 3 (train)0.1s: train loss = 0.41426 = (mf)0.41419 + (embed)0.00007
Epoch 4 (train)0.1s: train loss = 0.28265 = (mf)0.28251 + (embed)0.00014
Epoch 5 (train)0.1s + (eval)0.0s: train loss = 0.20666 = (mf)0.20646 + (embed)0.00020, recall = 0.02174, ndcg = 0.00936, precision = 0.00217, map = 0.00543
Epoch 6 (train)0.1s: train loss = 0.15322 = (mf)0.15296 + (embed)0.00026
Epoch 7 (train)0.1s: train loss = 0.12452 = (mf)0.12421 + (embed)0.00031
Epoch 8 (train)0.1s: train loss = 0.10244 = (mf)0.10208 + (embed)0.00035
Epoch 9 (train)0.1s: train loss = 0.08353 = (mf)0.08314 + (embed)0.00039
Epoch 10 (train)0.1s + (eval)0.0s: train loss = 0.07010 = (mf)0.06968 + (embed)0.00042, recall = 0.02174, ndcg = 0.00725, precision = 0.00217, map = 0.00311
Epoch 11 (train)0.1s: train loss = 0.05787 = (mf)0.05742 + (embed)0.00045
Epoch 12 (train)0.1s: train l

In [13]:
unique_item_counts = df_clicks['itemID'].nunique()

In [14]:
unique_item_counts

2571

In [15]:
topk_scores = model.recommend_k_items(df_clicks, top_k=5, remove_seen=False)
topk_scores.head()

Unnamed: 0,userID,itemID,prediction
0,206922,162655,8.635474
1,206922,293301,5.617226
2,206922,288870,5.546084
3,206922,199198,4.722063
4,206922,272143,4.19972


In [16]:
topk_scores[topk_scores["userID"] == 19573]

Unnamed: 0,userID,itemID,prediction


In [17]:
topk_scores["itemID"].dtype

dtype('int64')

In [18]:
# Evaluate the model
eval_map = map(test, topk_scores, k=TOP_K)
eval_ndcg = ndcg_at_k(test, topk_scores, k=TOP_K)
eval_precision = precision_at_k(test, topk_scores, k=TOP_K)
eval_recall = recall_at_k(test, topk_scores, k=TOP_K)

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')


MAP:	0.000000
NDCG:	0.000000
Precision@K:	0.000000
Recall@K:	0.000000


In [19]:
def get_user_article_scores(userID, model, data):
    try:
        # Check if the user exists in the dataset
        if userID not in data.train['userID'].values and userID not in data.test['userID'].values:
            print(f"User ID {userID} not found in the training or testing set.")
            return pd.DataFrame(columns=['userID', 'itemID', 'score'])
        
        # Prepare a DataFrame for the specific user to get recommendations
        user_df = pd.DataFrame({'userID': [userID] * data.n_items, 'itemID': range(data.n_items)})
        
        # Use the model to score all items for the user
        full_scores = model.recommend_k_items(user_df, top_k=data.n_items, remove_seen=False)
        full_scores['prediction'] = (full_scores['prediction'] - full_scores['prediction'].min()) / (full_scores['prediction'].max() - full_scores['prediction'].min())

        # # Extract item IDs and their scores
        full_scores.rename(columns={'user_id': 'user_id', "itemID":"article_id", "prediction":"score"}, inplace=True)

        return full_scores
        
    except Exception as e:
        # print(f"An error occurred: {e}")
        return pd.DataFrame(columns=['userID', 'itemID', 'score'])

In [20]:
userID = 322151
full_scores = get_user_article_scores(userID, model, data)

User ID 322151 not found in the training or testing set.


In [21]:
full_scores

Unnamed: 0,userID,itemID,score


In [22]:
data.train

Unnamed: 0,userID,itemID,rating
0,0,0,1.0
1,1,1,1.0
2,1,2,1.0
3,2,3,1.0
4,3,4,1.0
...,...,...,...
9949,9489,671,1.0
9950,9490,187,1.0
9951,9491,1987,1.0
9952,9492,2558,1.0


In [None]:
userIDs = df_clicks['userID'].unique()
userIDs.shape

In [None]:
result_list = []
for userID in userIDs:
    scores = get_user_article_scores(userID, model, data)
    result_list.append(scores)

result_df = pd.concat(result_list, ignore_index=True)

In [None]:
result_df
