In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn import Linear
from torch.nn import ReLU
from torch.nn import Sigmoid
from torch.nn import Softmax
from torch.nn import Module
from torch.optim import SGD
from torch.nn import BCELoss
from torch.nn import CrossEntropyLoss

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
    
from scipy import sparse
from os import path

import pickle

## Load raw data

In [3]:
import os
export_dir = os.getcwd()

In [7]:
HOME_DIR = "/recsys_LXR"
ML_DIR = "/Data

In [8]:
files_path = export_dir + ML_DIR

In [9]:
# Load the data
data = pd.read_csv(files_path + "/ratings.dat", sep="::", engine="python",
                   names=["user_id_original", "item_id_original", "rating", "timestamp"])

In [None]:
data.head()

## Getting some data statistics on users' history

In [10]:
sum(data.groupby('user_id_original')['item_id_original'].count() < 100)

3095

In [11]:
np.mean(data.groupby('user_id_original')['item_id_original'].count())

165.5975165562914

In [12]:
# Convert the ratings to binary values (1 if rating exists, 0 otherwise)
data["rating"] = data["rating"].apply(lambda x: 1 if x > 0 else 0)


# Encode target values
data["user_id"] = LabelEncoder().fit_transform(data.user_id_original)
data["item_id"] = LabelEncoder().fit_transform(data.item_id_original)

# Get the number of users and items in the dataset
num_users = data.user_id.unique().shape[0]
num_items = data.item_id.unique().shape[0]

In [13]:
data.item_id.nunique()

3706

## Preprocessing for baselines calculation

In [14]:
# Calculate the Jaccard similarity between two sets of genres
def jaccard_similarity(set1, set2):
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union

In [15]:
user_group = data[["user_id","item_id"]].groupby(data.user_id)

users_data = pd.DataFrame(
    data={
        "user_id": list(user_group.groups.keys()),
        "item_ids": list(user_group.item_id.apply(list)),
    }    
)

In [16]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
user_one_hot = pd.DataFrame(mlb.fit_transform(users_data["item_ids"]),columns=mlb.classes_, index=users_data["item_ids"].index)

In [17]:
user_one_hot["user_id"]=users_data["user_id"]

In [18]:
user_one_hot

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3697,3698,3699,3700,3701,3702,3703,3704,3705,user_id
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6035
6036,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6036
6037,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6037
6038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6038


In [19]:
user_one_hot.iloc[:,:-1]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3696,3697,3698,3699,3700,3701,3702,3703,3704,3705
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6036,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6037,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Positive samples for training

In [18]:
#Sample items from consumed items to predict (positive examples)
np.random.seed(3)
def get_y_values(numpy_arr):
    y_values = []
    users_arr = np.split(numpy_arr[:, 1], np.unique(numpy_arr[:, 0], return_index=True)[1][1:])
    for u in users_arr:
          y_values.append(int(np.random.choice(u[:-1],1,replace=False)))
    return y_values    

In [19]:
y_indices = get_y_values(np.argwhere(user_one_hot.to_numpy()>0))

In [20]:
len(y_indices)

6040

In [21]:
user_one_hot["y_positive"]= y_indices

In [22]:
user_one_hot

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3698,3699,3700,3701,3702,3703,3704,3705,user_id,y_positive
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,708
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1550
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,1327
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,2173
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,4,1356
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,6035,1581
6036,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6036,1537
6037,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6037,225
6038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6038,997


In [23]:
#Create representation of each item as a binary vector
items_values = pd.DataFrame(np.eye(num_items,dtype=int), columns=np.arange(num_items))

In [24]:
items_values

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3696,3697,3698,3699,3700,3701,3702,3703,3704,3705
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3701,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3702,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3703,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3704,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [25]:
items_values.iloc[581][581]

1

In [26]:
items_values.index.values

array([   0,    1,    2, ..., 3703, 3704, 3705])

In [27]:
items_values_dict = {}
for i in range(items_values.shape[0]):
    items_values_dict[i] = items_values.iloc[i,:]

In [34]:
# Output doctionaries for further easier calculations
# Items mapping to one-hot encoded vectors
file_path = 'items_values_dict_ML1.pkl'

# Open the file in write binary mode and use pickle.dump to save the dictionary
with open(file_path, 'wb') as f:
    pickle.dump(items_values_dict, f)   

In [28]:
items_values_dict[3704]

0       0
1       0
2       0
3       0
4       0
       ..
3701    0
3702    0
3703    0
3704    1
3705    0
Name: 3704, Length: 3706, dtype: int64

In [36]:
data.user_id.unique()

array([   0,    1,    2, ..., 6037, 6038, 6039])

## Split to training and test set

In [31]:
random_state = 12
# The list of users IDs
users_indices = data.user_id.unique()

# Set the split ratio (80% for training, 20% for testing)
split_ratio = 0.8

# Calculate the split index
shuffled_users_ids = np.random.permutation(users_indices)
split_index = int(len(shuffled_users_ids) * split_ratio)

# Split the list of user IDs into training and testing sets
train_user_ids = shuffled_users_ids[:split_index]
test_user_ids = shuffled_users_ids[split_index:]


In [32]:
# Split the user hot encoding matrix into training and testing sets based on the selected user IDs
train_data = user_one_hot.loc[train_user_ids]
test_data = user_one_hot.loc[test_user_ids]

In [33]:
train_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3698,3699,3700,3701,3702,3703,3704,3705,user_id,y_positive
1991,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1991,2166
3856,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3856,2931
4619,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4619,1848
3980,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3980,851
1079,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1079,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5964,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5964,2007
5764,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,5764,1166
3140,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,3140,1258
3495,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,3495,3197


In [34]:
training_data = data[data.user_id.isin(train_user_ids)]

# Group by item_id and aggregate user_id values into a list
item_group = training_data.groupby("item_id")

items_data = pd.DataFrame(
    data={
        "item_id": list(item_group.groups.keys()),
        "users_ids": list(item_group.user_id.apply(list)),
    }    
)

In [35]:
items_data 

Unnamed: 0,item_id,users_ids
0,0,"[0, 5, 7, 8, 9, 27, 35, 37, 43, 44, 47, 48, 50..."
1,1,"[9, 26, 43, 47, 52, 59, 61, 74, 82, 91, 113, 1..."
2,2,"[44, 61, 136, 152, 162, 165, 168, 198, 201, 22..."
3,3,"[7, 154, 186, 198, 202, 224, 328, 337, 345, 47..."
4,4,"[47, 154, 168, 201, 236, 260, 283, 300, 301, 3..."
...,...,...
3673,3701,"[8, 9, 14, 37, 47, 51, 55, 57, 60, 64, 80, 89,..."
3674,3702,"[29, 115, 148, 149, 166, 168, 172, 191, 194, 2..."
3675,3703,"[172, 194, 318, 481, 622, 623, 677, 744, 849, ..."
3676,3704,"[172, 194, 410, 592, 829, 837, 849, 855, 861, ..."


In [42]:
#calculate similarity between items based on users that consumed these items (Jaccard similarity)
def create_user_based_Jaccard_sim():

    # Calculate the items similarity based on users that consumed these items
    item_similarities = {}
    for index1, row1 in items_data.iterrows():
        for index2, row2 in items_data.iterrows():
            if row1["item_id"]!= row2["item_id"]: #and item2>item1:
                similarity = jaccard_similarity(set(row1["users_ids"]), set(row2["users_ids"]))
            else:
                similarity = 1
            item_similarities[(row1["item_id"], row2["item_id"])] = similarity

    return item_similarities 

In [75]:
user_similarities_Jaccard = create_user_based_Jaccard_sim()

In [76]:
import pickle

file_path = 'user_similarities_Jaccard_ML1.pkl'

# Open the file in write binary mode and use pickle.dump to save the dictionary
with open(file_path, 'wb') as f:
    pickle.dump(user_similarities_Jaccard, f)
#with open(file_path, 'rb') as f:
    #user_similarities_Jaccard = pickle.load(f)

In [36]:
# Use MultiLabelBinarizer to encode the user IDs for each item into a one-hot matrix
mlb = MultiLabelBinarizer(classes=train_user_ids)  # Only include train_user_ids
item_one_hot = pd.DataFrame(
    mlb.fit_transform(items_data["users_ids"]),
    columns=mlb.classes_,
    index=items_data["item_id"]
)

In [37]:
len(train_user_ids)

4832

In [38]:
item_one_hot

Unnamed: 0_level_0,1991,3856,4619,3980,1079,860,4191,4356,552,3341,...,5111,3015,167,1164,3053,5964,5764,3140,3495,917
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,1
1,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3701,1,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3702,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3703,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3704,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1]:
from sklearn.metrics.pairwise import cosine_similarity
#Item similarity based on cosine between user and item
def item_user_based_cos_sim():
        return cosine_similarity(item_one_hot)

In [49]:
cosine_items = item_user_based_cos_sim()

In [50]:
cosine_items

array([[1.        , 0.39954745, 0.29506839, ..., 0.11228692, 0.0386783 ,
        0.18949644],
       [0.39954745, 1.        , 0.25508484, ..., 0.09707137, 0.01486096,
        0.14142607],
       [0.29506839, 0.25508484, 1.        , ..., 0.08029411, 0.02681995,
        0.12194571],
       ...,
       [0.11228692, 0.09707137, 0.08029411, ..., 1.        , 0.20412415,
        0.25900956],
       [0.0386783 , 0.01486096, 0.02681995, ..., 0.20412415, 1.        ,
        0.20817604],
       [0.18949644, 0.14142607, 0.12194571, ..., 0.25900956, 0.20817604,
        1.        ]])

In [51]:
cosine_items_dict = {}

# Loop through the rows and columns of the ndarray and add each element to the dictionary
for i in range(cosine_items.shape[0]):
    for j in range(cosine_items.shape[1]):
        cosine_items_dict[(i, j)] = cosine_items[i][j]

In [52]:
cosine_items_dict[(0,0)]

1.0000000000000009

In [53]:
import pickle

file_path = 'cosine_items_ML1.pkl'

# Open the file in write binary mode and use pickle.dump to save the dictionary
with open(file_path, 'wb') as f:
    pickle.dump(cosine_items_dict, f)

In [55]:
cosine_items = cosine_items_dict

In [39]:
#Add negative examples to training data
#Calculate the popularity of each item in the training set
popularity_dict = train_data.iloc[:,:-2].sum(axis=0).to_dict()

In [40]:
popularity_dict[3705] 

318

In [41]:
import pickle
with open('pop_dict.pkl', 'wb') as f:
    pickle.dump(popularity_dict, f)

In [42]:
np.sum(train_data.iloc[:,3704])

32

In [43]:
prob_dict = {}
for k, v in popularity_dict.items():
    prob_dict[k] = v/sum(popularity_dict.values())
print(len(prob_dict))

3706


In [74]:
import pickle
with open('prob_dict.pkl', 'wb') as f:
    pickle.dump(prob_dict, f)

In [44]:
import operator
sorted(prob_dict.items(), key=operator.itemgetter(1),reverse=True)[0]

(2651, 0.003436319617393331)

In [45]:
#Sample negative points for training
def get_negative_samples(numpy_arr,num):
    negative_values = []
    users_arr = np.split(numpy_arr[:, 1], np.unique(numpy_arr[:, 0], return_index=True)[1][1:])
    for u in users_arr:
        items_from_dict_keys = [d for d in popularity_dict.keys() if d in u]
        sum_popularity= 0
        for it in items_from_dict_keys:
            sum_popularity += popularity_dict[it] 
        items_probs = [popularity_dict[d]/sum_popularity for d in items_from_dict_keys]
        negative_samples = np.random.choice(items_from_dict_keys,size=num,replace=False,p=items_probs)
        if(num == 1):
            negative_values.append(int(negative_samples))
        else:
            negative_values.append(negative_samples) 
                               
    return negative_values   

In [46]:
y_negative = get_negative_samples(np.argwhere(train_data.iloc[:,:-2].to_numpy()==0), num=1)

In [47]:
train_data["y_negative"] = y_negative

In [48]:
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3699,3700,3701,3702,3703,3704,3705,user_id,y_positive,y_negative
1991,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1991,2166,3412
3856,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3856,2931,2374
4619,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4619,1848,1260
3980,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3980,851,2736
1079,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1079,1,1215


In [49]:
train_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3699,3700,3701,3702,3703,3704,3705,user_id,y_positive,y_negative
1991,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1991,2166,3412
3856,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3856,2931,2374
4619,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4619,1848,1260
3980,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3980,851,2736
1079,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1079,1,1215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5964,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,5964,2007,2709
5764,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,5764,1166,983
3140,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,3140,1258,3353
3495,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,3495,3197,2203


In [53]:
#Positive and Negative examples merged together
train_data_mixed = train_data.merge(train_data.loc[:,['user_id','y_positive','y_negative']].melt('user_id', value_name='y_values').replace({'y_positive': 1, 'y_negative': 0}), on="user_id").rename(columns={'variable': 'interaction'}).drop(['y_positive', 'y_negative'],axis=1)

In [54]:
train_data_mixed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3699,3700,3701,3702,3703,3704,3705,user_id,interaction,y_values
0,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1991,1,2166
1,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1991,0,3412
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3856,1,2931
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3856,0,2374
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4619,1,1848


In [69]:
train_data_mixed.to_csv('train_data_mixed.csv', index=False)

In [55]:
train_data_mixed.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3699,3700,3701,3702,3703,3704,3705,user_id,interaction,y_values
count,9664.0,9664.0,9664.0,9664.0,9664.0,9664.0,9664.0,9664.0,9664.0,9664.0,...,9664.0,9664.0,9664.0,9664.0,9664.0,9664.0,9664.0,9664.0,9664.0,9664.0
mean,0.350166,0.117136,0.080919,0.028353,0.051118,0.158113,0.077194,0.010969,0.018005,0.149627,...,0.016763,0.008899,0.147558,0.050911,0.009934,0.006623,0.065811,3033.195985,0.5,1761.331126
std,0.477046,0.321599,0.272725,0.165987,0.220249,0.364865,0.266912,0.10416,0.132976,0.356724,...,0.12839,0.093919,0.35468,0.219827,0.099177,0.081113,0.247965,1752.061574,0.500026,1026.351268
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1503.5,0.0,982.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3038.5,0.5,1711.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4559.25,1.0,2624.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6039.0,1.0,3705.0


In [72]:
test_data.to_csv("test_data.csv", index = False)

In [None]:
#Create tf_idf file upon training data interactions

In [56]:
train_array = train_data_mixed.to_numpy()
test_array = test_data.to_numpy()

In [57]:
# Tf/idf calculation
import numpy as np

train_data =(train_array[train_array[:,-2]==0][:,:-3]).T
test_data = (test_array[:,:-2]).T

# Compute the IDF scores for each item in the train data
num_docs = train_data.shape[0]
idf = np.log(num_docs / (np.sum(train_data, axis=0) + 1))

# Compute the TF-IDF scores for each item in the train data
tfidf_matrix = np.zeros(train_data.shape)
for i in range(train_data.shape[1]):
    tf = train_data[:, i] / np.sum(train_data[:, i])
    tfidf_matrix[:, i] = tf * idf[i]  

tf_idf_items_dict = {(i, j): tfidf_matrix[i, j] for i in range(tfidf_matrix.shape[0]) for j in range(tfidf_matrix.shape[1])}

In [58]:
import pickle

file_path = 'tf_idf_items_ML1.pkl'

# Open the file in write binary mode and use pickle.dump to save the dictionary
with open(file_path, 'wb') as f:
    pickle.dump(tf_idf_items_dict, f)

## Recommender model training

In [None]:
class MLP_G(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(MLP_G, self).__init__()
        self.linear_x = nn.Linear(input_size, hidden_size, bias = False)
        self.linear_y = nn.Linear(input_size, hidden_size, bias = False)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, user, item):
        user_representation = self.linear_x(user.float())
        item_representation = self.linear_y(item.float())
        dot_prod = torch.matmul(user_representation, item_representation.T)
        dot_sigmoid = self.sigmoid(dot_prod)
        
        return dot_sigmoid

In [None]:
class Recommender_G(nn.Module):
    def __init__(self, num_items, hidden_size):
        super(Recommender_G, self).__init__()
        self.mlp = MLP_G(num_items, hidden_size).to(device)

    def forward(self, user_vector, item_vector):
        user_vector = user_vector.to(device)
        item_vector = item_vector.to(device)
        output = self.mlp(user_vector, item_vector)
        return output.to(device)

In [None]:
def get_top_k(user_vector, original_user_vector, num_items, model, top_k):
    item_prob_dict = {}
    user_tensor = torch.Tensor(user_vector).to(device)
    item_tensor = torch.FloatTensor(items_array).to(device)
    output_model = [float(i) for i in model(user_tensor, item_tensor).cpu().detach().numpy()]
    
    original_user_vector = np.array(original_user_vector.cpu())
    neg = np.ones_like(original_user_vector)- original_user_vector
    output = neg*output_model
    for i in range(len(output)):
        item_prob_dict[i]=output[i]

    sorted_items_by_prob  = sorted(item_prob_dict.items(), key=lambda item: item[1],reverse=True)

    return dict(sorted_items_by_prob[0:top_k])

In [None]:
# Train the model on GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hidden_dim = 20
print("num_users is ", num_users)
print("num_items is ", num_items)
recommender_model_g = Recommender_G(num_items, hidden_dim)
recommender_loss_g = nn.BCELoss()
recommender_optimizer_g = torch.optim.Adam(recommender_model_g.parameters(), lr=0.001)

In [None]:
train_losses = []
epochs = 10


for epoch in range(epochs):
    if epoch==20:
        recommender_optimizer.lr=0.0001
    train_loss = 0
    for i in range(train_array.shape[0]):
        item_id = train_array[i][-1]
        user_id = train_array[i][-3]
        item_vector = items_values_dict[item_id]
        user_vector = train_array[i][:-3]
        user_vector[item_id] = 0 
        interact = train_array[i][-2]
        
        user_tensor = torch.FloatTensor(user_vector).to(device)
        item_tensor = torch.FloatTensor(item_vector).to(device)
      
        interact_tensor = torch.FloatTensor([interact]).to(device)
        interact_tensor = torch.tensor(interact_tensor.item()).to(device)
        
        recommender_optimizer_g.zero_grad()
        recommender_output = (recommender_model_g(user_tensor, item_tensor)).to(device)
        rec_loss = recommender_loss_g(recommender_output, interact_tensor)
        
        train_loss+=rec_loss.item()
            
        rec_loss.backward()
        recommender_optimizer_g.step()
    train_losses.append(train_loss/train_array.shape[0])
    print(f"Epoch {epoch}, Train Loss {train_loss/train_array.shape[0]:.4f}")

In [None]:
torch.save(recommender_model_g.state_dict(), 'recommender_model.pt')

In [None]:
#Froze recommender
for param in recommender_model_g.parameters():
    param.requires_grad= False

In [None]:
#Get users vectors to create topk
unique_indices = np.unique(train_array[:,-3], return_index=True, axis=0)[1]

# create a new array with only the unique users
train_unique_arr = train_array[unique_indices, :]
items_array = items_values.to_numpy()

In [None]:
#Create top_k dictionary that contains user, item, score for the rec_model
#and is stored as (user_id, item_id, score) values
def create_topk_data(data_, rec_model, data_type = "train"):

    user_item_matrix = np.zeros((num_users, num_items))
    
    for i in range(data_.shape[0]):

        if(data_type == "train"):
            user_id = data_[i][-3]
            user_vector = data_[i][:-3]
        elif(data_type == "test"):
            user_id = data_[i][-2]
            user_vector = data_[i][:-2]
            
        user_tensor = torch.FloatTensor(user_vector).to(device)
            
        top_items = get_top_k(user_tensor, user_tensor, num_items, rec_model, num_items)
      
        for item_id in top_items.keys(): 
            user_item_matrix[user_id, item_id] = top_items[item_id]
        
        if(i%100 == 0):
            print(i)
            
    return user_item_matrix


In [None]:
topk_train = create_topk_data(train_unique_arr, rec_model, data_type = "train")

In [None]:
topk_test = create_topk_data(test_array, rec_model, data_type = "test")

In [None]:
filename = 'topk_train.pkl'

with open(filename, 'wb') as f:
    pickle.dump(topk_train, f)

In [None]:
filename = 'topk_test.pkl'

with open(filename, 'wb') as f:
    pickle.dump(topk_test, f)