In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn import Linear
from torch.nn import ReLU
from torch.nn import Sigmoid
from torch.nn import Softmax
from torch.nn import Module
from torch.optim import SGD
from torch.nn import BCELoss
from torch.nn import CrossEntropyLoss

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity

from scipy import sparse
import os
from os import path
from pathlib import Path
import pickle
import operator

SEED = 3
np.random.seed(SEED)

## Load data

In [3]:
ML_DIR = "Data"
export_dir = Path(os.getcwd())
files_path = Path(export_dir.parent, ML_DIR)

In [4]:
print(export_dir)

/media/dready/Data/dready/LXR/Data_preprocessing


In [5]:
data = pd.read_csv(Path(files_path, "pinterest-20.train.500krating.csv"), engine="python",
                   names=["user_id_original", "item_id_original", "interaction"])

In [6]:
data.head()

Unnamed: 0,user_id_original,item_id_original,interaction
0,0,2,1
1,0,3,1
2,0,4,1
3,0,5,1
4,0,6,1


In [7]:
data.describe()

Unnamed: 0,user_id_original,item_id_original,interaction
count,500001.0,500001.0,500001.0
mean,9569.492157,3754.179276,1.0
std,5544.849093,2430.101642,0.0
min,0.0,0.0,1.0
25%,4729.0,1618.0,1.0
50%,9518.0,3398.0,1.0
75%,14408.0,5717.0,1.0
max,19154.0,9647.0,1.0


## Getting some data statistics on users' history

In [8]:
sum(data.groupby('user_id_original')['item_id_original'].count() < 100)

19155

In [9]:
np.mean(data.groupby('user_id_original')['item_id_original'].count())

26.102897415818322

In [12]:
# Encode target values
data["user_id"] = LabelEncoder().fit_transform(data.user_id_original)
data["item_id"] = LabelEncoder().fit_transform(data.item_id_original)

# Get the number of users and items in the dataset
num_users = data.user_id.unique().shape[0]
num_items = data.item_id.unique().shape[0]

#### N unique items

In [13]:
data.item_id.nunique()

9639

## Preprocessing for baselines calculation

In [14]:
def jaccard_similarity(set1, set2):
    """
    Calculate the Jaccard similarity between two sets 
    """
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union

In [15]:
user_group = data[["user_id","item_id"]].groupby(data.user_id)

users_data = pd.DataFrame(
    data={
        "user_id": list(user_group.groups.keys()),
        "item_ids": list(user_group.item_id.apply(list)),
    }    
)

In [16]:
mlb = MultiLabelBinarizer()
user_one_hot = pd.DataFrame(mlb.fit_transform(users_data["item_ids"]),columns=mlb.classes_, index=users_data["item_ids"].index)

In [17]:
user_one_hot["user_id"] = users_data["user_id"]

In [18]:
user_one_hot

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9630,9631,9632,9633,9634,9635,9636,9637,9638,user_id
0,1,0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19150,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,19150
19151,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,19151
19152,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,19152
19153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,19153


In [19]:
user_one_hot.iloc[:,:-1]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9629,9630,9631,9632,9633,9634,9635,9636,9637,9638
0,1,0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19150,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19151,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19152,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Positive samples for training

In [20]:
def get_y_values(numpy_arr):
    """
    Sample items from consumed items to predict (positive examples)
    """
    y_values = []
    users_arr = np.split(numpy_arr[:, 1], np.unique(numpy_arr[:, 0], return_index=True)[1][1:])
    for u in users_arr:
          y_values.append(int(np.random.choice(u[:-1],1,replace=False)))
    return y_values    

In [21]:
y_indices = get_y_values(np.argwhere(user_one_hot.to_numpy()>0))

In [22]:
len(y_indices)

19155

In [23]:
user_one_hot["y_positive"] = y_indices

In [24]:
user_one_hot

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9631,9632,9633,9634,9635,9636,9637,9638,user_id,y_positive
0,1,0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,21
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,29
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,57
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,86
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19150,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,19150,7554
19151,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,19151,628
19152,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,19152,609
19153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,19153,2295


#### Create representation of each item as a binary vector

In [25]:
items_values = pd.DataFrame(np.eye(num_items,dtype=int), columns=np.arange(num_items))

In [26]:
items_values.to_csv(Path(export_dir, "items_values_Pinterest.csv"), index = False)

In [27]:
items_values.iloc[581][581]

1

In [28]:
items_values.index.values

array([   0,    1,    2, ..., 9636, 9637, 9638])

In [29]:
items_values_dict = {}
for i in range(items_values.shape[0]):
    items_values_dict[i] = items_values.iloc[i,:]

In [30]:
# Output doctionaries for further easier calculations
# Items mapping to one-hot encoded vectors
file_path = 'items_values_dict_Pinterest.pkl'

# Open the file in write binary mode and use pickle.dump to save the dictionary
with open(Path(export_dir, file_path), 'wb') as f:
    pickle.dump(items_values_dict, f)   

In [31]:
items_values_dict[3704]

0       0
1       0
2       0
3       0
4       0
       ..
9634    0
9635    0
9636    0
9637    0
9638    0
Name: 3704, Length: 9639, dtype: int64

In [32]:
data.user_id.unique()

array([    0,     1,     2, ..., 19152, 19153, 19154])

## Split to training and test set

In [33]:
random_state = 12

# The list of users IDs
users_indices = data.user_id.unique()

# Set the split ratio (80% for training, 20% for testing)
split_ratio = 0.8

# Calculate the split index
shuffled_users_ids = np.random.permutation(users_indices)
split_index = int(len(shuffled_users_ids) * split_ratio)

# Split the list of user IDs into training and testing sets
train_user_ids = shuffled_users_ids[:split_index]
test_user_ids = shuffled_users_ids[split_index:]

In [34]:
# Split the user hot encoding matrix into training and testing sets based on the selected user IDs
train_data = user_one_hot.loc[train_user_ids]
test_data = user_one_hot.loc[test_user_ids]

In [35]:
train_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9631,9632,9633,9634,9635,9636,9637,9638,user_id,y_positive
1890,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1890,7540
18396,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,18396,7965
16441,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,16441,144
9199,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,9199,7133
18551,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,18551,2903
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8201,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,8201,8891
10398,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,10398,6228
16564,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,16564,7592
16781,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,16781,3664


In [36]:
training_data = data[data.user_id.isin(train_user_ids)]

# Group by item_id and aggregate user_id values into a list
item_group = training_data.groupby("item_id")

items_data = pd.DataFrame(
    data={
        "item_id": list(item_group.groups.keys()),
        "users_ids": list(item_group.user_id.apply(list)),
    }    
)

In [37]:
items_data 

Unnamed: 0,item_id,users_ids
0,0,"[214, 301, 669, 720, 738, 831, 831, 1000, 1282..."
1,1,"[318, 387, 461, 605, 834, 1075, 1214, 1345, 14..."
2,2,"[56, 347, 365, 454, 465, 1011, 1544, 1545, 156..."
3,3,"[101, 222, 387, 936, 1599, 2038, 2686, 3588, 4..."
4,4,"[111, 246, 434, 491, 1181, 1193, 3599, 4211, 4..."
...,...,...
9561,9633,[18440]
9562,9635,"[18699, 18700, 18701, 18702, 18703, 18704]"
9563,9636,"[18755, 18757, 18760]"
9564,9637,[18778]


In [38]:
def create_user_based_Jaccard_sim():
    """
    calculate similarity between items based on users that consumed these items (Jaccard similarity)
    """
    item_similarities = {}
    for index1, row1 in items_data.iterrows():
        for index2, row2 in items_data.iterrows():
            if row1["item_id"]!= row2["item_id"]:
                similarity = jaccard_similarity(set(row1["users_ids"]), set(row2["users_ids"]))
            else:
                similarity = 1
            item_similarities[(row1["item_id"], row2["item_id"])] = similarity

    return item_similarities 

In [39]:
user_similarities_Jaccard = create_user_based_Jaccard_sim()

In [40]:
file_path = 'user_similarities_Jaccard_Pinterest.pkl'

# Open a file in write binary mode and use pickle.dump to save the dictionary
with open(Path(export_dir, file_path), 'wb') as f:
    pickle.dump(user_similarities_Jaccard, f)

"""
with open(file_path, 'rb') as f:
    user_similarities_Jaccard = pickle.load(f)
"""

"\nwith open(file_path, 'rb') as f:\n    user_similarities_Jaccard = pickle.load(f)\n"

#### Use MultiLabelBinarizer to encode the user IDs for each item into a one-hot matrix

In [41]:
mlb = MultiLabelBinarizer(classes=train_user_ids)  # Only include train_user_ids
item_one_hot = pd.DataFrame(
    mlb.fit_transform(items_data["users_ids"]),
    columns=mlb.classes_,
    index=items_data["item_id"]
)

In [42]:
len(train_user_ids)

15324

In [43]:
item_one_hot

Unnamed: 0_level_0,1890,18396,16441,9199,18551,10947,16119,15200,13673,18834,...,7925,8022,4696,5493,1433,8201,10398,16564,16781,1888
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9633,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9635,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9636,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9637,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
def item_user_based_cos_sim():
    """
    Item similarity based on cosine between user and item
    """
    return cosine_similarity(item_one_hot)

In [45]:
cosine_items = item_user_based_cos_sim()

In [46]:
cosine_items

array([[1.        , 0.01277753, 0.01906925, ..., 0.        , 0.        ,
        0.        ],
       [0.01277753, 1.        , 0.06091449, ..., 0.        , 0.        ,
        0.        ],
       [0.01906925, 0.06091449, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [47]:
cosine_items_dict = {}

# Loop through the rows and columns of the ndarray and add each element to the dictionary
for i in range(cosine_items.shape[0]):
    for j in range(cosine_items.shape[1]):
        cosine_items_dict[(i, j)] = cosine_items[i][j]

In [48]:
cosine_items_dict[(0,0)]

1.0000000000000007

In [49]:
file_path = 'cosine_items_Pinterest.pkl'

# Open the file in write binary mode and use pickle.dump to save the dictionary
with open(Path(export_dir, file_path), 'wb') as f:
    pickle.dump(cosine_items_dict, f)

In [50]:
cosine_items = cosine_items_dict

### Add negative examples to training data & Calculate the popularity of each item in the training set

In [51]:
popularity_dict = train_data.iloc[:,:-2].sum(axis=0).to_dict()

In [52]:
popularity_dict[3705] 

49

In [53]:
file_path ='pop_dict_Pinterest.pkl'

with open(Path(export_dir, file_path), 'wb') as f:
    pickle.dump(popularity_dict, f)

In [54]:
np.sum(train_data.iloc[:,3704])

17

In [55]:
prob_dict = {}
for k, v in popularity_dict.items():
    prob_dict[k] = v / sum(popularity_dict.values())
print(len(prob_dict))

9639


In [56]:
file_path ='prob_dict_Pinterest.pkl'

with open(Path(export_dir, file_path), 'wb') as f:
    pickle.dump(prob_dict, f)

In [57]:
sorted(prob_dict.items(), key=operator.itemgetter(1),reverse=True)[0]

(487, 0.0010195433883766918)

In [58]:
def get_negative_samples(numpy_arr,num):
    """
    Sample negative points for training
    """
    negative_values = []
    users_arr = np.split(numpy_arr[:, 1], np.unique(numpy_arr[:, 0], return_index=True)[1][1:])
    for u in users_arr:
        items_from_dict_keys = [d for d in popularity_dict.keys() if d in u]
        sum_popularity= 0
        for it in items_from_dict_keys:
            sum_popularity += popularity_dict[it] 
        items_probs = [popularity_dict[d]/sum_popularity for d in items_from_dict_keys]
        negative_samples = np.random.choice(items_from_dict_keys,size=num,replace=False,p=items_probs)
        if(num == 1):
            negative_values.append(int(negative_samples))
        else:
            negative_values.append(negative_samples) 
                               
    return negative_values   

In [59]:
y_negative = get_negative_samples(np.argwhere(train_data.iloc[:,:-2].to_numpy()==0), num=1)

In [60]:
train_data["y_negative"] = y_negative

In [61]:
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9632,9633,9634,9635,9636,9637,9638,user_id,y_positive,y_negative
1890,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1890,7540,926
18396,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,18396,7965,1164
16441,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,16441,144,5662
9199,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,9199,7133,4171
18551,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,18551,2903,8003


In [62]:
train_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9632,9633,9634,9635,9636,9637,9638,user_id,y_positive,y_negative
1890,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1890,7540,926
18396,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,18396,7965,1164
16441,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,16441,144,5662
9199,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,9199,7133,4171
18551,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,18551,2903,8003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8201,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,8201,8891,5724
10398,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,10398,6228,2917
16564,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,16564,7592,1255
16781,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,16781,3664,1185


### Positive and Negative examples merged together

In [63]:
train_data_mixed = train_data.merge(train_data.loc[:,['user_id','y_positive','y_negative']].melt('user_id', value_name='y_values').replace({'y_positive': 1, 'y_negative': 0}), on="user_id").rename(columns={'variable': 'interaction'}).drop(['y_positive', 'y_negative'],axis=1)

In [64]:
train_data_mixed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9632,9633,9634,9635,9636,9637,9638,user_id,interaction,y_values
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1890,1,7540
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1890,0,926
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,18396,1,7965
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,18396,0,1164
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,16441,1,144


In [65]:
train_data_mixed.to_csv(Path(export_dir,'train_data_Pinterest.csv'), index=False)

In [66]:
train_data_mixed.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9632,9633,9634,9635,9636,9637,9638,user_id,interaction,y_values
count,30648.0,30648.0,30648.0,30648.0,30648.0,30648.0,30648.0,30648.0,30648.0,30648.0,...,30648.0,30648.0,30648.0,30648.0,30648.0,30648.0,30648.0,30648.0,30648.0,30648.0
mean,0.008157,0.003198,0.005743,0.002219,0.001762,0.002676,0.006395,0.00633,0.002284,0.007244,...,0.0,6.5e-05,0.0,0.000392,0.000196,6.5e-05,6.5e-05,9576.615114,0.5,3766.537914
std,0.089949,0.056458,0.075563,0.047052,0.041939,0.051657,0.079715,0.07931,0.047737,0.084802,...,0.0,0.008078,0.0,0.019784,0.013991,0.008078,0.008078,5520.791361,0.500008,2446.59998
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4808.75,0.0,1623.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9549.5,0.5,3398.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14353.25,1.0,5745.25
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,19154.0,1.0,9635.0


In [67]:
test_data.to_csv(Path(export_dir, "test_data_Pinterest.csv"), index = False)

### Create tf_idf file upon training data interactions

In [68]:
train_array = train_data_mixed.to_numpy()
test_array = test_data.to_numpy()

In [69]:
# Tf/idf calculation
import numpy as np

train_data =(train_array[train_array[:,-2]==0][:,:-3]).T
test_data = (test_array[:,:-2]).T

# Compute the IDF scores for each item in the train data
num_docs = train_data.shape[0]
idf = np.log(num_docs / (np.sum(train_data, axis=0) + 1))

# Compute the TF-IDF scores for each item in the train data
tfidf_matrix = np.zeros(train_data.shape)
for i in range(train_data.shape[1]):
    tf = train_data[:, i] / np.sum(train_data[:, i])
    tfidf_matrix[:, i] = tf * idf[i]  

tf_idf_items_dict = {(i, j): tfidf_matrix[i, j] for i in range(tfidf_matrix.shape[0]) for j in range(tfidf_matrix.shape[1])}

In [70]:
file_path = 'tf_idf_items_Pinterest.pkl'

# Open the file in write binary mode and use pickle.dump to save the dictionary
with open(Path(export_dir, file_path), 'wb') as f:
    pickle.dump(tf_idf_items_dict, f)

## Recommender model training

In [71]:
class MLP_G(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(MLP_G, self).__init__()
        self.linear_x = nn.Linear(input_size, hidden_size, bias = False)
        self.linear_y = nn.Linear(input_size, hidden_size, bias = False)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, user, item):
        user_representation = self.linear_x(user.float())
        item_representation = self.linear_y(item.float())
        dot_prod = torch.matmul(user_representation, item_representation.T)
        dot_sigmoid = self.sigmoid(dot_prod)
        
        return dot_sigmoid

In [72]:
class Recommender_G(nn.Module):
    def __init__(self, num_items, hidden_size):
        super(Recommender_G, self).__init__()
        self.mlp = MLP_G(num_items, hidden_size).to(device)

    def forward(self, user_vector, item_vector):
        user_vector = user_vector.to(device)
        item_vector = item_vector.to(device)
        output = self.mlp(user_vector, item_vector)
        return output.to(device)

In [73]:
def get_top_k(user_vector, original_user_vector, num_items, model, top_k):
    item_prob_dict = {}
    user_tensor = torch.Tensor(user_vector).to(device)
    item_tensor = torch.FloatTensor(items_array).to(device)
    output_model = [float(i) for i in model(user_tensor, item_tensor).cpu().detach().numpy()]
    
    original_user_vector = np.array(original_user_vector.cpu())
    neg = np.ones_like(original_user_vector)- original_user_vector
    output = neg*output_model
    for i in range(len(output)):
        item_prob_dict[i]=output[i]

    sorted_items_by_prob  = sorted(item_prob_dict.items(), key=lambda item: item[1],reverse=True)

    return dict(sorted_items_by_prob[0:top_k])

In [74]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hidden_dim = 20
print("num_users is ", num_users)
print("num_items is ", num_items)
recommender_model_g = Recommender_G(num_items, hidden_dim)
recommender_loss_g = nn.BCELoss()
recommender_optimizer_g = torch.optim.Adam(recommender_model_g.parameters(), lr=0.001)

num_users is  19155
num_items is  9639


In [75]:
train_losses = []
epochs = 10

for epoch in range(epochs):
    if epoch == 20:
        recommender_optimizer.lr = 0.0001
    train_loss = 0
    for i in range(train_array.shape[0]):
        item_id = train_array[i][-1]
        user_id = train_array[i][-3]
        item_vector = items_values_dict[item_id]
        user_vector = train_array[i][:-3]
        user_vector[item_id] = 0 
        interact = train_array[i][-2]
        
        user_tensor = torch.FloatTensor(user_vector).to(device)
        item_tensor = torch.FloatTensor(item_vector).to(device)
      
        interact_tensor = torch.FloatTensor([interact]).to(device)
        interact_tensor = torch.tensor(interact_tensor.item()).to(device)
        
        recommender_optimizer_g.zero_grad()
        recommender_output = (recommender_model_g(user_tensor, item_tensor)).to(device)
        rec_loss = recommender_loss_g(recommender_output, interact_tensor)
        
        train_loss+=rec_loss.item()
            
        rec_loss.backward()
        recommender_optimizer_g.step()
    train_losses.append(train_loss/train_array.shape[0])
    print(f"Epoch {epoch}, Train Loss {train_loss/train_array.shape[0]:.4f}")

  dot_prod = torch.matmul(user_representation, item_representation.T)


Epoch 0, Train Loss 0.6438
Epoch 1, Train Loss 0.3277
Epoch 2, Train Loss 0.1445
Epoch 3, Train Loss 0.0591
Epoch 4, Train Loss 0.0203
Epoch 5, Train Loss 0.0057
Epoch 6, Train Loss 0.0014
Epoch 7, Train Loss 0.0002
Epoch 8, Train Loss 0.0000
Epoch 9, Train Loss 0.0000


In [76]:
torch.save(recommender_model_g.state_dict(), Path(export_dir, 'recommender_model_Pinterest.pt'))

### Recommender Freezing

In [77]:
for param in recommender_model_g.parameters():
    param.requires_grad = False

In [78]:
# Get users vectors to create topk
unique_indices = np.unique(train_array[:,-3], return_index=True, axis=0)[1]

# create a new array with only the unique users
train_unique_arr = train_array[unique_indices, :]
items_array = items_values.to_numpy()

In [79]:
def create_topk_data(data_, rec_model, data_type = "train"):
    """
    Create top_k dictionary that contains user, item, score for the rec_model and is stored as (user_id, item_id, score) values
    """
    user_item_matrix = np.zeros((num_users, num_items))
    
    for i in range(data_.shape[0]):

        if(data_type == "train"):
            user_id = data_[i][-3]
            user_vector = data_[i][:-3]
        elif(data_type == "test"):
            user_id = data_[i][-2]
            user_vector = data_[i][:-2]
            
        user_tensor = torch.FloatTensor(user_vector).to(device)
            
        top_items = get_top_k(user_tensor, user_tensor, num_items, rec_model, num_items)
      
        for item_id in top_items.keys(): 
            user_item_matrix[user_id, item_id] = top_items[item_id]
        
        if(i % 100 == 0):
            print(i)
            
    return user_item_matrix

In [80]:
topk_train = create_topk_data(train_unique_arr, recommender_model_g, data_type="train")

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300


In [81]:
topk_test = create_topk_data(test_array, recommender_model_g, data_type="test")

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800


In [82]:
file_path= 'topk_train_Pinterest.pkl'

with open(Path(export_dir, file_path), 'wb') as f:
    pickle.dump(topk_train, f)

In [83]:
file_path = 'topk_test_Pinterest.pkl'

with open(Path(export_dir, file_path), 'wb') as f:
    pickle.dump(topk_test, f)