In [37]:
import torch
import pickle
from pathlib import Path
from recsys_slates_dataset import dataset_torch
import numpy as np
import os
import json
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD


In [26]:

current_directory = os.getcwd()
data_path = os.path.join(current_directory,"finn_data", "data.npz")


data = np.load(data_path, allow_pickle=True)


print("Available arrays:", data.files)


slate = data["slate"]
click = data["click"]
interaction_type = data["interaction_type"]
click_idx = data["click_idx"]
slate_lengths = data["slate_lengths"]


print("Slate shape:", slate.shape)
print("Click shape:", click.shape)
print("Interaction Type shape:", interaction_type.shape)
print("Click Index shape:", click_idx.shape)
print("Slate Lengths shape:", slate_lengths.shape)

Available arrays: ['userId', 'click', 'click_idx', 'slate_lengths', 'slate', 'interaction_type']
Slate shape: (2277645, 20, 25)
Click shape: (2277645, 20)
Interaction Type shape: (2277645, 20)
Click Index shape: (2277645, 20)
Slate Lengths shape: (2277645, 20)


In [38]:

click_data=click
n_users = click_data.shape[0]
n_items = 1311775  


rows = click_data.flatten()  
cols = np.repeat(np.arange(n_users), click_data.shape[1])  


print("First 10 rows (item indices, before filtering 0):")
print(rows[:10])  

mask = (rows > 0)
filtered_rows = rows[mask]
filtered_cols = cols[mask]


data = np.ones_like(filtered_rows, dtype=np.int8)
item_user_matrix = csr_matrix((data, (filtered_rows, filtered_cols)), shape=(n_items, n_users))

print(f"Sparse matrix shape: {item_user_matrix.shape}")
print(f"Non-zero interactions: {item_user_matrix.nnz}")

n_components = 100  
svd = TruncatedSVD(n_components=n_components, random_state=42)
reduced_embeddings = svd.fit_transform(item_user_matrix)

print("Reduced embeddings shape:", reduced_embeddings.shape)  

First 10 rows (item indices, before filtering 0 and 1):
[ 474955  477038  923419 1043617       1  908293 1304331 1304942 1304641
 1038839]
Sparse matrix shape: (1311775, 2277645)
Non-zero interactions: 28228595
Reduced embeddings shape: (1311775, 100)


In [48]:
item_embedding_dict = {item_id: embedding for item_id, embedding in zip(range(2, n_items+1), reduced_embeddings)}

# Save the embeddings to a pickle file
base_path = Path("/home/aayush/rsys_data/rsys_2025")
output_file = base_path / "item_embeddings_finn.pkl"

# Save to a pickle file
with open(output_file, 'wb') as f:
    pickle.dump(item_embedding_dict, f)

print(f"Item embeddings saved to: {output_file}")

# Cross-check: Check that the embeddings correspond to the correct item IDs


Item embeddings saved to: /home/aayush/rsys_data/rsys_2025/item_embeddings_finn.pkl


In [56]:
output_file = "/home/aayush/rsys_data/rsys_2025/item_embeddings_finn.pkl"

with open(output_file, 'rb') as f:
    item_embedding_dict = pickle.load(f)


print(f"Loaded {len(item_embedding_dict)} item embeddings.")
print(f"Embedding for item 2: {item_embedding_dict[1311775]}")

Loaded 1311774 item embeddings.
Embedding for item 2: [ 7.88776177e-04 -1.42017029e-04 -2.00610590e-06 -3.37076006e-05
 -4.17100069e-05 -9.59805897e-05  4.17132929e-05 -3.16400546e-05
  5.23982274e-05  2.67678004e-05  3.84504504e-05 -7.47377426e-05
  9.82862782e-05  3.22708237e-05 -3.56101712e-05  1.44050368e-05
 -6.07815971e-05 -1.52183514e-05 -1.01252010e-05  4.42772465e-05
 -1.63684479e-05 -9.22245071e-05 -5.61864031e-05 -5.01701790e-05
 -4.65711973e-05  3.31952457e-05  5.67730664e-05  4.21107189e-05
  2.35502301e-04  1.13208607e-04 -2.54141941e-05 -2.24234674e-05
  3.01769902e-05  2.47645668e-05 -5.29284997e-05  4.75453199e-05
  5.15703493e-05  7.43833876e-05 -3.10585050e-05 -1.03147084e-05
 -8.14663178e-05 -9.83090785e-05 -1.01161518e-04  9.84655059e-05
 -1.28551772e-04  1.19721019e-05  8.43247017e-05 -1.13849626e-04
  2.18758605e-05 -1.44188147e-05  9.94448244e-06 -1.74191499e-04
 -5.90872375e-05 -8.15043166e-05  2.50881534e-05 -1.69280424e-04
 -1.02130951e-04 -1.32656486e-04 -5.

In [11]:
import json
import os

# Define the current directory if not already set
current_directory = os.getcwd()  # or set it to the correct path if needed

# Define the correct path to the ind2val.json file
json_path = os.path.join(current_directory, "finn_data", "ind2val.json")

# Load and inspect the ind2val.json file
with open(json_path, 'r') as f:
    ind2val = json.load(f)


print(json.dumps(ind2val, indent=2)) 

{
  "category": {
    "0": "PAD",
    "1": "noClick",
    "2": "<UNK>",
    "3": "BAP,antiques,Tr\u00f8ndelag",
    "4": "MOTOR,,Sogn og Fjordane",
    "5": "BAP,electronicsappliances,Finnmark",
    "6": "BAP,entertainmenthobbyleisure,\u00d8stfold",
    "7": "MOTOR,,Hedmark",
    "8": "BAP,antiques",
    "9": "BOAT,,M\u00f8re og Romsdal",
    "10": "BAP,animalsequipment,\u00d8stfold",
    "11": "BAP,entertainmenthobbyleisure,M\u00f8re og Romsdal",
    "12": "BAP,housegardenrenovation,\u00d8stfold",
    "13": "REAL_ESTATE,,M\u00f8re og Romsdal",
    "14": "BAP,business,Oppland",
    "15": "BAP",
    "16": "BAP,housegardenrenovation,Rogaland",
    "17": "BAP,housegardenrenovation,Vestfold",
    "18": "BAP,business,Troms",
    "19": "BAP,electronicsappliances,",
    "20": "BAP,furnitureinterior,M\u00f8re og Romsdal",
    "21": "BAP,parentschildren,M\u00f8re og Romsdal",
    "22": "BAP,sportsoutdoors,Vest-Agder",
    "23": "BAP,animalsequipment,Sogn og Fjordane",
    "24": "BAP,electronics

In [40]:
user_id = 5
interaction_num = 3

# Get the presented slate and click information
presented_slate = slate[user_id, interaction_num]
user_click = click[user_id, interaction_num]

print(f"Presented Slate for User {user_id} at Interaction {interaction_num}:", presented_slate)
print(f"User Click for User {user_id} at Interaction {interaction_num}:", user_click)


Presented Slate for User 5 at Interaction 3: [     1 638995 638947 638711 637590 637930 638894      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0]
User Click for User 5 at Interaction 3: 637590


In [41]:

json_path = os.path.join(current_directory, "data", "ind2val.json")

with open(json_path, "r") as file:
    ind2val = json.load(file)

for key, val in ind2val.items():
  print(" ")
  print(f"{key} first entries:")
  for idx, name in val.items():
    print(f"{idx}: {val[idx]}")
    

# Explore the keys and values
print("Keys in ind2val.json:", ind2val.keys())
print("Category Mapping Sample:", list(ind2val["category"].items())[:-3])
print("Interaction Type Mapping Sample:", list(ind2val["interaction_type"].items()))


 
category first entries:
0: PAD
1: noClick
2: <UNK>
3: BAP,antiques,Trøndelag
4: MOTOR,,Sogn og Fjordane
5: BAP,electronicsappliances,Finnmark
6: BAP,entertainmenthobbyleisure,Østfold
7: MOTOR,,Hedmark
8: BAP,antiques
9: BOAT,,Møre og Romsdal
10: BAP,animalsequipment,Østfold
11: BAP,entertainmenthobbyleisure,Møre og Romsdal
12: BAP,housegardenrenovation,Østfold
13: REAL_ESTATE,,Møre og Romsdal
14: BAP,business,Oppland
15: BAP
16: BAP,housegardenrenovation,Rogaland
17: BAP,housegardenrenovation,Vestfold
18: BAP,business,Troms
19: BAP,electronicsappliances,
20: BAP,furnitureinterior,Møre og Romsdal
21: BAP,parentschildren,Møre og Romsdal
22: BAP,sportsoutdoors,Vest-Agder
23: BAP,animalsequipment,Sogn og Fjordane
24: BAP,electronicsappliances,Sogn og Fjordane
25: BAP,sportsoutdoors,Nordland
26: BAP,sportsoutdoors,Rogaland
27: MOTOR,,Møre og Romsdal
28: BAP,animalsequipment,Rogaland
29: BAP,electronicsappliances,Trøndelag
30: BAP,sportsoutdoors,Buskerud
31: BAP,parentschildren,Trøndelag
3

In [19]:
itemattr_path = os.path.join(current_directory, "finn_data", "itemattr.npz")
itemattr_data = np.load(itemattr_path, allow_pickle=True)
category=itemattr_data['category']
print("Category shape:", category.shape)

print("Available arrays in itemattr.npz:", itemattr_data.files)


Category shape: (1311775,)
Available arrays in itemattr.npz: ['category']


In [24]:
category[1]

np.float64(1.0)

In [57]:
ind2val['category'].get(str(134))

'BAP,entertainmenthobbyleisure,Telemark'

In [63]:
ind2val['category'].get(str(int(category[638947])))

'REAL_ESTATE,,Oppland'

In [55]:
ind2val['category'].get(134)

In [65]:
# Get the slate for User 5 at Interaction 3
presented_slate = slate[8, 3]

# Map each index in the slate to its category string
mapped_categories = [ind2val["category"].get(str(int(category[idx])),"<UNK>") for idx in presented_slate]
print("Mapped Categories for Presented Slate:", mapped_categories)


Mapped Categories for Presented Slate: ['noClick', 'BOAT,,Akershus', '<UNK>', 'BOAT,,Vestfold', 'BOAT,,Akershus', 'BOAT,,Rogaland', 'BOAT,,Trøndelag', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']


In [None]:
# Get the interaction type for User 5 at Interaction 3
interaction_type_idx = interaction_type[5, 8]
mapped_interaction_type = ind2val["interaction_type"].get(str(interaction_type_idx), "<UNK>")
print("Interaction Type:", mapped_interaction_type)


Interaction Type: rec


In [23]:
print("Sample Keys in Category Mapping:", list(ind2val["category"].keys())[:10])
print("Total Mappings in Category:", len(ind2val["category"]))


Sample Keys in Category Mapping: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
Total Mappings in Category: 290


In [25]:
unique_indices = np.unique(slate)
valid_categories = [ind2val["category"].get(str(idx)) for idx in unique_indices if str(idx) in ind2val["category"]]
print("Mapped Valid Categories:", valid_categories[:10])
print("Total Valid Mapped Categories:", len(valid_categories))


Mapped Valid Categories: ['PAD', 'noClick', '<UNK>', 'BAP,antiques,Trøndelag', 'MOTOR,,Sogn og Fjordane', 'BAP,electronicsappliances,Finnmark', 'BAP,entertainmenthobbyleisure,Østfold', 'MOTOR,,Hedmark', 'BAP,antiques', 'BOAT,,Møre og Romsdal']
Total Valid Mapped Categories: 289


In [67]:
print("Find the itemId that were click by user 5 in interaction 3:")
itemId = [data['click'][5,3]]
print(f"itemId: {itemId}")

print("\nFind the category of that item in itemattr:")
category_str = ind2val["category"].get(str(int(category[itemId])), "<UNK>")
print(f"Category for item {itemId}: {category_str}")


Find the itemId that were click by user 5 in interaction 3:
itemId: [np.int64(637590)]

Find the category of that item in itemattr:
Category for item [np.int64(637590)]: REAL_ESTATE,,Oppland


  category_str = ind2val["category"].get(str(int(category[itemId])), "<UNK>")
