In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn import Linear
from torch.nn import ReLU
from torch.nn import Sigmoid
from torch.nn import Softmax
from torch.nn import Module
from torch.optim import SGD
from torch.nn import BCELoss
from torch.nn import CrossEntropyLoss

In [2]:
from sklearn.decomposition import NMF
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
    
from scipy import sparse
from os import path

## Loading data

In [7]:
HOME_DIR = "/recsys_LTX"
ML_DIR = "/Data"

In [8]:
import os
export_dir = os.getcwd()

In [5]:
files_path = export_dir + HOME_DIR  + ML_DIR

In [6]:
# Load the data
data = pd.read_csv( files_path +"/yahoo_music_subset.csv", sep=",", engine="python",
                   names=["user_id_original","item_id_original", "rating"])

## Data statistics

In [11]:
data.head()

Unnamed: 0,user_id_original,item_id_original,rating
0,1,1000125,90
1,1,1006373,100
2,1,1006978,90
3,1,1007035,100
4,1,1007098,100


In [12]:
data.shape

(687590, 3)

In [13]:
#Users with one or two items in the interaction
user_ids = list(data.groupby('user_id_original')['item_id_original'].count()[lambda x: x < 3].index)

In [14]:
#remove such users from the data set
filtered_data = data[~data['user_id_original'].isin(user_ids)]

In [15]:
#Verification 
print(filtered_data.user_id_original.nunique()+len(user_ids) == data.user_id_original.nunique())

True


In [16]:
data = filtered_data

In [17]:
np.mean(data.groupby('user_id_original')['item_id_original'].count())

49.864116575591986

In [18]:
# Convert the ratings to binary values (1 if rating exists, 0 otherwise)
data["rating"] = data["rating"].apply(lambda x: 1 if x > 0 else 0)


# Encode target values
data["user_id"] = LabelEncoder().fit_transform(data.user_id_original)
data["item_id"] = LabelEncoder().fit_transform(data.item_id_original)

# Get the number of users and items in the dataset
num_users = data.user_id.unique().shape[0]
num_items = data.item_id.unique().shape[0]

In [19]:
num_users = data.user_id.unique().shape[0]
num_items = data.item_id.unique().shape[0]

In [20]:
num_users

13725

In [21]:
num_items

10265

## Data processing 

In [22]:
user_group = data[["user_id","item_id"]].groupby(data.user_id)

users_data = pd.DataFrame(
    data={
        "user_id": list(user_group.groups.keys()),
        "item_ids": list(user_group.item_id.apply(list)),
    }    
)

In [17]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
user_one_hot = pd.DataFrame(mlb.fit_transform(users_data["item_ids"]),columns=mlb.classes_, index=users_data["item_ids"].index)

In [18]:
user_one_hot["user_id"]=users_data["user_id"]

In [19]:
user_one_hot

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10256,10257,10258,10259,10260,10261,10262,10263,10264,user_id
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,3
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13720,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,13720
13721,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,13721
13722,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,13722
13723,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,13723


In [20]:
user_one_hot.iloc[:,:-1]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10255,10256,10257,10258,10259,10260,10261,10262,10263,10264
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13720,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13721,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13722,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13723,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
#Sample items from consumed items to predict (positive examples)
np.random.seed(3)
def get_y_values(numpy_arr):
    y_values = []
    users_arr = np.split(numpy_arr[:, 1], np.unique(numpy_arr[:, 0], return_index=True)[1][1:])
    for u in users_arr:
          y_values.append(int(np.random.choice(u[:-1],1,replace=False)))
    return y_values    

In [22]:
y_indices = get_y_values(np.argwhere(user_one_hot.to_numpy()>0))

In [23]:
len(y_indices)

13725

In [24]:
user_one_hot["y_positive"]= y_indices

In [26]:
#Create representation of each item as a binary vector
items_values = pd.DataFrame(np.eye(num_items,dtype=int), columns=np.arange(num_items))

In [41]:
items_values.to_csv("items_values_Yahoo.csv", index = False)

In [28]:
items_values.index.values

array([    0,     1,     2, ..., 10262, 10263, 10264])

In [29]:
items_values_dict = {}
for i in range(items_values.shape[0]):
    items_values_dict[i] = items_values.iloc[i,:]

In [45]:
import pickle

file_path = 'items_values_dict_Yahoo.pkl'

# Open the file in write binary mode and use pickle.dump to save the dictionary
with open(file_path, 'wb') as f:
    pickle.dump(items_values_dict, f)

In [30]:
data.user_id.unique()

array([    0,     1,     2, ..., 13722, 13723, 13724])

## Split to training and testing

In [31]:
random_state = 12
# The list of users IDs
users_indices = data.user_id.unique()

# Set the split ratio (80% for training, 20% for testing)
split_ratio = 0.8

# Calculate the split index
shuffled_users_ids = np.random.permutation(users_indices)
split_index = int(len(shuffled_users_ids) * split_ratio)

# Split the list of user IDs into training and testing sets
train_user_ids = shuffled_users_ids[:split_index]
test_user_ids = shuffled_users_ids[split_index:]


In [32]:
train_user_ids

array([ 7987, 10191,  3867, ...,  1294,  5314,  5833])

In [33]:
# Split the user hot encoding matrix into training and testing sets based on the selected user IDs
train_data = user_one_hot.loc[train_user_ids]
test_data = user_one_hot.loc[test_user_ids]

In [34]:
test_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10257,10258,10259,10260,10261,10262,10263,10264,user_id,y_positive
4891,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4891,3025
12745,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,12745,7426
9355,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,9355,5693
6963,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6963,6269
10601,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,10601,6542
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12514,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,12514,5069
12872,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,12872,9205
12987,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,12987,2543
2833,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2833,9914


In [35]:
training_data = data[data.user_id.isin(train_user_ids)]

# Group by item_id and aggregate user_id values into a list
item_group = training_data.groupby("item_id")

items_data = pd.DataFrame(
    data={
        "item_id": list(item_group.groups.keys()),
        "users_ids": list(item_group.user_id.apply(list)),
    }    
)

In [36]:
items_data 

Unnamed: 0,item_id,users_ids
0,0,"[61, 62, 160, 303, 318, 472, 665, 689, 801, 12..."
1,1,"[418, 725, 1177, 1718, 1761, 1960, 2070, 2856,..."
2,2,"[56, 135, 598, 1207, 1418, 1570, 1789, 2045, 2..."
3,3,"[1980, 6194, 10376]"
4,4,"[38, 224, 341, 420, 460, 488, 555, 598, 744, 7..."
...,...,...
9615,10260,"[14, 7061, 8959, 9258, 10421]"
9616,10261,"[2025, 2198, 3085, 3440, 3893, 4709, 5237, 706..."
9617,10262,"[1394, 4709, 5896, 7524, 7908, 8133, 10811]"
9618,10263,"[154, 168, 529, 1135, 1602, 2385, 3622, 3641, ..."


## Creating files for baseline calculation

In [None]:
# Calculate the Jaccard similarity between two sets of genres
def jaccard_similarity(set1, set2):
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union

In [53]:
#calculate similarity between items based on users that consumed these items (Jaccard similarity)
def create_user_based_Jaccard_sim():

    # Calculate the items similarity based on users that consumed these items
    item_similarities = {}
    for index1, row1 in items_data.iterrows():
        for index2, row2 in items_data.iterrows():
            if row1["item_id"]!= row2["item_id"]: #and item2>item1:
                similarity = jaccard_similarity(set(row1["users_ids"]), set(row2["users_ids"]))
            else:
                similarity = 1
            item_similarities[(row1["item_id"], row2["item_id"])] = similarity

    return item_similarities 


In [54]:
user_similarities_Jaccard = create_user_based_Jaccard_sim()

In [55]:
import pickle

file_path = 'user_similarities_Jaccard_Yahoo.pkl'

# Open the file in write binary mode and use pickle.dump to save the dictionary
with open(file_path, 'wb') as f:
    pickle.dump(user_similarities_Jaccard, f)

In [37]:
# Use MultiLabelBinarizer to encode the user IDs for each item into a one-hot matrix
mlb = MultiLabelBinarizer(classes=train_user_ids)  # Only include train_user_ids
item_one_hot = pd.DataFrame(
    mlb.fit_transform(items_data["users_ids"]),
    columns=mlb.classes_,
    index=items_data["item_id"]
)

In [38]:
training_data.item_id.nunique()

9620

In [39]:
item_one_hot

Unnamed: 0_level_0,7987,10191,3867,11711,4369,1149,3645,1200,11361,6591,...,9794,6689,4537,292,4149,4527,13270,1294,5314,5833
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10260,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10261,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10262,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10263,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
from sklearn.metrics.pairwise import cosine_similarity
#Item similarity based on cosine between user and item
def item_user_based_cos_sim():
        return cosine_similarity(item_one_hot)

In [60]:
cosine_items = item_user_based_cos_sim()

In [61]:
cosine_items

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.15430335],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.15430335, 0.        ,
        1.        ]])

In [62]:
cosine_items_dict = {}

# Loop through the rows and columns of the ndarray and add each element to the dictionary
for i in range(cosine_items.shape[0]):
    for j in range(cosine_items.shape[1]):
        cosine_items_dict[(i, j)] = cosine_items[i][j]

In [63]:
cosine_items_dict[(0,0)]

0.9999999999999996

In [64]:
import pickle

file_path = 'cosine_items_Yahoo.pkl'

# Open the file in write binary mode and use pickle.dump to save the dictionary
with open(file_path, 'wb') as f:
    pickle.dump(cosine_items_dict, f)

In [65]:
cosine_items = cosine_items_dict

In [40]:
#Add negative examples to training data
#Calculate the popularity of each item in the training set
popularity_dict = train_data.iloc[:,:-2].sum(axis=0).to_dict()

In [41]:
import pickle

file_path = 'pop_dict_Yahoo.pkl'

with open(file_path, 'wb') as f:
    pickle.dump(popularity_dict,f)  

In [42]:
prob_dict = {}
for k, v in popularity_dict.items():
    prob_dict[k] = v/sum(popularity_dict.values())
print(len(prob_dict))

10265


In [43]:
import pickle

file_path = 'prob_dict_Yahoo.pkl'

with open(file_path, 'wb') as f:
    pickle.dump(prob_dict,f)  

In [44]:
import operator
sorted(prob_dict.items(), key=operator.itemgetter(1),reverse=True)[0]

(7898, 0.009128906551289727)

In [45]:
#sample negative points 
def get_negative_samples(numpy_arr,num):
    negative_values = []
    users_arr = np.split(numpy_arr[:, 1], np.unique(numpy_arr[:, 0], return_index=True)[1][1:])
    for u in users_arr:
        items_from_dict_keys = [d for d in popularity_dict.keys() if d in u]
        sum_popularity= 0
        for it in items_from_dict_keys:
            sum_popularity += popularity_dict[it] 
        items_probs = [popularity_dict[d]/sum_popularity for d in items_from_dict_keys]
        negative_samples = np.random.choice(items_from_dict_keys,size=num,replace=False,p=items_probs)
        if(num == 1):
            negative_values.append(int(negative_samples))
        else:
            negative_values.append(negative_samples) 
                               
    return negative_values   

In [46]:
y_negative = get_negative_samples(np.argwhere(train_data.iloc[:,:-2].to_numpy()==0), num=1)

In [47]:
train_data["y_negative"] = y_negative

In [48]:
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10258,10259,10260,10261,10262,10263,10264,user_id,y_positive,y_negative
7987,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,7987,5303,3309
10191,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,10191,7053,2273
3867,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3867,7389,321
11711,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,11711,10027,9007
4369,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4369,2651,6148


In [49]:
train_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10258,10259,10260,10261,10262,10263,10264,user_id,y_positive,y_negative
7987,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,7987,5303,3309
10191,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,10191,7053,2273
3867,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3867,7389,321
11711,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,11711,10027,9007
4369,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4369,2651,6148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4527,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4527,7736,5393
13270,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,13270,453,8784
1294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1294,7285,1406
5314,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,5314,3245,9318


In [50]:
#Positive and Negative examples merged together
train_data_mixed = train_data.merge(train_data.loc[:,['user_id','y_positive','y_negative']].melt('user_id', value_name='y_values').replace({'y_positive': 1, 'y_negative': 0}), on="user_id").rename(columns={'variable': 'interaction'}).drop(['y_positive', 'y_negative'],axis=1)

In [51]:
train_data_mixed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10258,10259,10260,10261,10262,10263,10264,user_id,interaction,y_values
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,7987,1,5303
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,7987,0,3309
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,10191,1,7053
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,10191,0,2273
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3867,1,7389


In [78]:
train_data_mixed.to_csv('train_data_mixed_Yahoo.csv', index=False)

In [79]:
test_data.to_csv("test_data_Yahoo.csv", index = False)

In [None]:
train_array = train_data_mixed.to_numpy()
test_array = test_data.to_numpy()

In [None]:
import numpy as np

train_data =(train_array[train_array[:,-2]==0][:,:-3]).T
test_data = (test_array[:,:-2]).T

# Compute the IDF scores for each item in the train data
num_docs = train_data.shape[0]
idf = np.log(num_docs / (np.sum(train_data, axis=0) + 1))

# Compute the TF-IDF scores for each item in the train data
tfidf_matrix = np.zeros(train_data.shape)
for i in range(train_data.shape[1]):
    tf = train_data[:, i] / np.sum(train_data[:, i])
    tfidf_matrix[:, i] = tf * idf[i]  

tf_idf_items_dict = {(i, j): tfidf_matrix[i, j] for i in range(tfidf_matrix.shape[0]) for j in range(tfidf_matrix.shape[1])}

In [None]:
import pickle

file_path = 'tf_idf_items_Yahoo.pkl'

# Open the file in write binary mode and use pickle.dump to save the dictionary
with open(file_path, 'wb') as f:
    pickle.dump(tf_idf_items_dict, f)

## Prepare Models

In [None]:
class MLP_G(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(MLP_G, self).__init__()
        self.linear_x = nn.Linear(input_size, hidden_size, bias = False)
        self.linear_y = nn.Linear(input_size, hidden_size, bias = False)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, user, item):
        user_representation = self.linear_x(user.float())
        item_representation = self.linear_y(item.float())
        dot_prod = torch.matmul(user_representation, item_representation.T)
        dot_sigmoid = self.sigmoid(dot_prod)
        
        return dot_sigmoid

In [None]:
class Recommender_G(nn.Module):
    def __init__(self, num_items, hidden_size):
        super(Recommender_G, self).__init__()
        self.mlp = MLP_G(num_items, hidden_size).to(device)

    def forward(self, user_vector, item_vector):
        user_vector = user_vector.to(device)
        item_vector = item_vector.to(device)
        output = self.mlp(user_vector, item_vector)
        return output.to(device)

In [None]:
def get_top_k(user_vector, original_user_vector, num_items, model, top_k):
    item_prob_dict = {}
    user_tensor = torch.Tensor(user_vector).to(device)
    item_tensor = torch.FloatTensor(items_array).to(device)
    output_model = [float(i) for i in model(user_tensor, item_tensor).cpu().detach().numpy()]
    
    original_user_vector = np.array(original_user_vector.cpu())
    neg = np.ones_like(original_user_vector)- original_user_vector
    output = neg*output_model
    for i in range(len(output)):
        item_prob_dict[i]=output[i]

    sorted_items_by_prob  = sorted(item_prob_dict.items(), key=lambda item: item[1],reverse=True)

    return dict(sorted_items_by_prob[0:top_k])

## Train data on these models

In [None]:
# Train the model on GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hidden_dim = 20
print("num_users is ", num_users)
print("num_items is ", num_items)
recommender_model_g = Recommender_G(num_items, hidden_dim)
recommender_loss_g = nn.BCELoss()
recommender_optimizer_g = torch.optim.Adam(recommender_model_g.parameters(), lr=0.001)

In [None]:
train_losses = []
epochs = 10


for epoch in range(epochs):
    if epoch==20:
        recommender_optimizer.lr=0.0001
    train_loss = 0
    for i in range(train_array.shape[0]):
        item_id = train_array[i][-1]
        user_id = train_array[i][-3]
        item_vector = items_values_dict[item_id]
        user_vector = train_array[i][:-3]
        user_vector[item_id] = 0 
        interact = train_array[i][-2]
        
        user_tensor = torch.FloatTensor(user_vector).to(device)
        item_tensor = torch.FloatTensor(item_vector).to(device)
      
        interact_tensor = torch.FloatTensor([interact]).to(device)
        interact_tensor = torch.tensor(interact_tensor.item()).to(device)
        
        recommender_optimizer_g.zero_grad()
        recommender_output = (recommender_model_g(user_tensor, item_tensor)).to(device)
        rec_loss = recommender_loss_g(recommender_output, interact_tensor)
        
        train_loss+=rec_loss.item()
            
        rec_loss.backward()
        recommender_optimizer_g.step()
    train_losses.append(train_loss/train_array.shape[0])
    print(f"Epoch {epoch}, Train Loss {train_loss/train_array.shape[0]:.4f}")

In [87]:
torch.save(recommender_model_g.state_dict(), 'recommender_model_yahoo.pt')

In [88]:
#Froze recommender
for param in recommender_model_g.parameters():
    param.requires_grad= False

In [89]:
rec_model = recommender_model_g

## Save top k items in training and test set

In [90]:
#Get users vectors to create topk
unique_indices = np.unique(train_array[:,-3], return_index=True, axis=0)[1]

# create a new array with only the unique users
train_unique_arr = train_array[unique_indices, :]
items_array = items_values.to_numpy()

In [91]:
#Create top_k dictionary that contains user, item, score for the rec_model
#and is stored as (user_id, item_id, score) values
def create_topk_data(data_, rec_model, data_type = "train"):

    user_item_matrix = np.zeros((num_users, num_items))
    
    for i in range(data_.shape[0]):

        if(data_type == "train"):
            user_id = data_[i][-3]
            user_vector = data_[i][:-3]
        elif(data_type == "test"):
            user_id = data_[i][-2]
            user_vector = data_[i][:-2]
            
        user_tensor = torch.FloatTensor(user_vector).to(device)
            
        top_items = get_top_k(user_tensor, user_tensor, num_items, rec_model, num_items)
      
        for item_id in top_items.keys(): 
            user_item_matrix[user_id, item_id] = top_items[item_id]
        
        if(i%100 == 0):
            print(i)
            
    return user_item_matrix


In [24]:
topk_train = create_topk_data(train_unique_arr, rec_model, data_type = "train")

In [25]:
topk_test = create_topk_data(test_array, rec_model, data_type = "test")

In [94]:
filename = 'topk_train_Yahoo.pkl'

# open the file in write-binary mode and save the array
with open(filename, 'wb') as f:
    pickle.dump(topk_train, f)

In [95]:
filename = 'topk_test_Yahoo.pkl'

# open the file in write-binary mode and save the array
with open(filename, 'wb') as f:
    pickle.dump(topk_test, f)