In [1]:
from helpers import load_dataset, preprocessing_clicks

import pandas as pd
import numpy as np
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.evaluation.python_evaluation import map, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.utils.constants import SEED as DEFAULT_SEED
from recommenders.utils.timer import Timer
from recommenders.utils.notebook_utils import store_metadata
from recommenders.models.deeprec.deeprec_utils import prepare_hparams
import tensorflow as tf
tf.get_logger().setLevel('ERROR')

In [2]:
TOP_K = 10
EPOCHS = 50
BATCH_SIZE = 1024
SEED = DEFAULT_SEED

In [70]:
# Load datasets
df_articles, df_clicks, article_embeddings = load_dataset()

# Preprocessing articles
df_clicks = preprocessing_clicks(df_clicks)

In [4]:
# Split clicks dataset into training and test sets
df_clicks = df_clicks.sample(n=100000, random_state=SEED)
df_clicks.rename(columns={'user_id': 'userID', "click_article_id":"itemID"}, inplace=True)
df_clicks["itemID"] = df_clicks["itemID"].astype('int64')
df_clicks['rating'] = 1.0

train, test = python_stratified_split(df_clicks, ratio=0.8, col_user="userID", col_item="itemID", seed=SEED)

In [5]:
train.head()

Unnamed: 0,userID,session_id,session_size,itemID,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type,session_start_dt,click_timestamp_dt,click_hour,click_dayofweek,rating
858202,0,1506825423271737,2,157541,4,3,20,1,20,2,2017-10-01 02:37:03,2017-10-01 03:00:28.020,3,6,1.0
4700,1,1507820499195442,2,183176,4,1,17,1,17,2,2017-10-12 15:01:39,2017-10-12 16:33:31.163,16,3,1.0
858208,3,1506825442704740,2,236065,4,3,2,1,21,1,2017-10-01 02:37:22,2017-10-01 03:12:16.942,3,6,1.0
856703,4,1507605254275655,2,271261,4,1,17,1,16,2,2017-10-10 03:14:14,2017-10-10 03:15:20.003,3,1,1.0
2023195,5,1507633524270796,7,348091,4,1,12,1,25,1,2017-10-10 11:05:24,2017-10-10 11:30:02.122,11,1,1.0


In [6]:
train['old_index'] = train.index
train.reset_index(drop=True, inplace=True)


In [7]:
test['old_index'] = test.index
test.reset_index(drop=True, inplace=True)

In [8]:
# data.test[:5]

In [9]:
test[["userID", "itemID","old_index"]][:5]

Unnamed: 0,userID,itemID,old_index
0,16,162286,858239
1,24,225010,232034
2,32,234698,1700422
3,48,271859,1577270
4,51,235689,988440


In [10]:
data = ImplicitCF(train=train, test=test, seed=SEED, col_user="userID", col_item="itemID")

In [11]:
# Create userID and itemID mappings for train
user_mapping_train = dict(zip(train['userID'], data.train['userID']))
item_reverse_mapping_train = dict(zip(data.train['itemID'], train['itemID']))

# Create userID and itemID mappings for test
user_mapping_test = dict(zip(test['userID'], data.test['userID']))
item_reverse_mapping_test = dict(zip(data.test['itemID'], test['itemID']))

# Combine userID mappings
user_mapping = {**user_mapping_train, **user_mapping_test}
# Combine itemID mappings (we reverse both train and test item mappings)
item_reverse_mapping = {**item_reverse_mapping_train, **item_reverse_mapping_test}

# Example usage
user_id_example = 10
item_id_example = 0

print(f"UserID {user_id_example} corresponds to UserID {user_mapping[user_id_example]}")
print(f"ItemID {item_id_example} corresponds to ItemID {item_reverse_mapping[item_id_example]}")

user_id_example = 2151
item_id_example = 2559

print(f"UserID {user_id_example} corresponds to UserID {user_mapping[user_id_example]}")
print(f"ItemID {item_id_example} corresponds to ItemID {item_reverse_mapping[item_id_example]}")

UserID 10 corresponds to UserID 7
ItemID 0 corresponds to ItemID 157541
UserID 2151 corresponds to UserID 902
ItemID 2559 corresponds to ItemID 203397


In [12]:
yaml_file = "input/lightgcn.yaml"
# Hyperparameters configuration
hparams = prepare_hparams(
    yaml_file,
    n_layers=3,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=0.005,
    eval_epoch=5,
    top_k=TOP_K,
    save_model=True,
    save_epoch=50,
    MODEL_DIR="./input/models/"
)

In [13]:
model = LightGCN(hparams, data, seed=SEED)

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


2024-05-27 13:04:23.856292: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:375] MLIR V1 optimization pass is not enabled


In [14]:
# Train the model
with Timer() as train_time:
    model.fit()
print("Took {} seconds for training.".format(train_time.interval))


Epoch 1 (train)6.8s: train loss = 0.32808 = (mf)0.32794 + (embed)0.00014
Epoch 2 (train)6.7s: train loss = 0.07398 = (mf)0.07359 + (embed)0.00039
Epoch 3 (train)6.8s: train loss = 0.03749 = (mf)0.03697 + (embed)0.00052
Epoch 4 (train)6.7s: train loss = 0.02241 = (mf)0.02180 + (embed)0.00061
Epoch 5 (train)6.6s + (eval)1.0s: train loss = 0.01515 = (mf)0.01447 + (embed)0.00068, recall = 0.03337, ndcg = 0.01655, precision = 0.00342, map = 0.01142
Epoch 6 (train)6.6s: train loss = 0.01069 = (mf)0.00996 + (embed)0.00073
Epoch 7 (train)6.7s: train loss = 0.00823 = (mf)0.00746 + (embed)0.00077
Epoch 8 (train)6.6s: train loss = 0.00668 = (mf)0.00588 + (embed)0.00080
Epoch 9 (train)6.7s: train loss = 0.00561 = (mf)0.00478 + (embed)0.00082
Epoch 10 (train)7.2s + (eval)1.0s: train loss = 0.00462 = (mf)0.00378 + (embed)0.00084, recall = 0.03112, ndcg = 0.01525, precision = 0.00316, map = 0.01045
Epoch 11 (train)7.0s: train loss = 0.00407 = (mf)0.00321 + (embed)0.00086
Epoch 12 (train)7.0s: train l

In [15]:
unique_item_counts = df_clicks['itemID'].nunique()

In [16]:
unique_item_counts

8974

In [17]:
topk_scores = model.recommend_k_items(df_clicks, top_k=5, remove_seen=False)
topk_scores.head()

Unnamed: 0,userID,itemID,prediction
0,64700,124194,29.942514
1,64700,107305,10.797299
2,64700,181322,10.444427
3,64700,87235,9.414076
4,64700,234269,9.075692


In [18]:
topk_scores[topk_scores["userID"] == 19573]

Unnamed: 0,userID,itemID,prediction
49155,19573,20691,10.876348
49156,19573,87228,10.498782
49157,19573,237822,5.286204
49158,19573,283009,4.912339
49159,19573,336220,4.689189


In [19]:
topk_scores["itemID"].dtype

dtype('int64')

In [20]:
# Evaluate the model
eval_map = map(test, topk_scores, k=TOP_K)
eval_ndcg = ndcg_at_k(test, topk_scores, k=TOP_K)
eval_precision = precision_at_k(test, topk_scores, k=TOP_K)
eval_recall = recall_at_k(test, topk_scores, k=TOP_K)

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')


MAP:	0.004687
NDCG:	0.006536
Precision@K:	0.001236
Recall@K:	0.012247


In [21]:

# print(f"UserID {user_id_example} corresponds to UserID {user_mapping[user_id_example]}")
# print(f"ItemID {item_id_example} corresponds to ItemID {item_reverse_mapping[item_id_example]}")

In [58]:
def get_user_article_scores(originUserID, model, data):
    userID = user_mapping[originUserID]
    try:
        # Check if the user exists in the dataset
        if userID not in data.train['userID'].values and userID not in data.test['userID'].values:
            print(f"User ID {userID} not found in the training or testing set.")
            return pd.DataFrame(columns=['user_id', 'article_id', 'score'])
        
        # Prepare a DataFrame for the specific user to get recommendations
        user_df = pd.DataFrame({'userID': [userID] * data.n_items, 'itemID': range(data.n_items)})
        
        # Use the model to score all items for the user
        full_scores = model.recommend_k_items(user_df, top_k=data.n_items, remove_seen=False)
        full_scores["userID"] = originUserID
        full_scores['prediction'] = (full_scores['prediction'] - full_scores['prediction'].min()) / (full_scores['prediction'].max() - full_scores['prediction'].min())

        # # Extract item IDs and their scores
        full_scores.rename(columns={'userID': 'user_id', "itemID":"article_id", "prediction":"score"}, inplace=True)

        return full_scores
        
    except Exception as e:
        # print(f"An error occurred: {e}")
        return pd.DataFrame(columns=['user_id', 'article_id', 'score'])

In [31]:
train.value_counts()

userID  session_id        session_size  itemID  click_environment  click_deviceGroup  click_os  click_country  click_region  click_referrer_type  session_start_dt     click_timestamp_dt       click_hour  click_dayofweek  rating  old_index
0       1506825423271737  2             157541  4                  3                  20        1              20            2                    2017-10-01 02:37:03  2017-10-01 03:00:28.020  3           6                1.0     858202       1
139006  1507214501125885  4             95834   4                  1                  17        1              9             1                    2017-10-05 14:41:41  2017-10-05 14:46:46.022  14          3                1.0     2186965      1
139002  1507054352206334  3             199197  4                  1                  17        1              25            2                    2017-10-03 18:12:32  2017-10-03 18:15:00.095  18          1                1.0     213074       1
138992  1507668743217123  4  

In [59]:
train[train["userID"] == 55694]

Unnamed: 0,userID,session_id,session_size,itemID,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type,session_start_dt,click_timestamp_dt,click_hour,click_dayofweek,rating,old_index
31247,55694,1506951414314759,6,284844,4,1,17,1,25,1,2017-10-02 13:36:54,2017-10-02 14:05:29.332,14,0,1.0,1521463
31248,55694,1507126075236996,8,198613,4,1,17,1,25,1,2017-10-04 14:07:55,2017-10-04 14:29:30.415,14,2,1.0,2259362
31249,55694,1506946605609696,4,272660,4,1,17,1,25,2,2017-10-02 12:16:45,2017-10-02 12:31:06.784,12,0,1.0,1906605
31250,55694,1508165404422562,4,202381,4,1,17,1,25,2,2017-10-16 14:50:04,2017-10-16 14:50:23.967,14,0,1.0,1033734
31251,55694,1508159611363935,5,331564,4,1,17,1,25,1,2017-10-16 13:13:31,2017-10-16 13:29:21.717,13,0,1.0,1184323


In [60]:
data.train[data.train["userID"] == 20320]

Unnamed: 0,userID,itemID,rating
31247,20320,107,1.0
31248,20320,1507,1.0
31249,20320,97,1.0
31250,20320,810,1.0
31251,20320,764,1.0


In [54]:
# full_scores[full_scores["article_id"] == 31836]

In [61]:
userID = 163
full_scores = get_user_article_scores(userID, model, data)
full_scores

Unnamed: 0,user_id,article_id,score
0,163,338340,1.000000
1,163,119592,0.962592
2,163,293301,0.723339
3,163,225010,0.694668
4,163,235230,0.662620
...,...,...,...
8969,163,206315,0.097610
8970,163,303180,0.077740
8971,163,237524,0.070577
8972,163,236444,0.053712


In [62]:
full_scores

Unnamed: 0,user_id,article_id,score
0,163,338340,1.000000
1,163,119592,0.962592
2,163,293301,0.723339
3,163,225010,0.694668
4,163,235230,0.662620
...,...,...,...
8969,163,206315,0.097610
8970,163,303180,0.077740
8971,163,237524,0.070577
8972,163,236444,0.053712


In [None]:
data.train

In [63]:
userIDs = df_clicks['userID'].unique()
userIDs.shape

(71185,)

In [69]:
from tqdm import tqdm

result_list = []
for userID in tqdm(userIDs):
    scores = get_user_article_scores(userID, model, data)
    result_list.append(scores)

result_df = pd.concat(result_list, ignore_index=True)

  1%|          | 386/71185 [00:05<15:44, 74.95it/s] 


KeyboardInterrupt: 

In [68]:
result_df

Unnamed: 0,user_id,article_id,score
0,151044,141004,1.000000
1,151044,338350,0.732868
2,151044,285261,0.709908
3,151044,95716,0.709162
4,151044,123909,0.665377
...,...,...,...
3185765,259488,156355,0.063742
3185766,259488,272218,0.063623
3185767,259488,108855,0.037537
3185768,259488,124194,0.024460


In [52]:
result_df["itemID"].isna().sum()


228980584