In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn import Linear
from torch.nn import ReLU
from torch.nn import Sigmoid
from torch.nn import Softmax
from torch.nn import Module
from torch.optim import SGD
from torch.nn import BCELoss
from torch.nn import CrossEntropyLoss

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity

from scipy import sparse
import os
from os import path
from pathlib import Path
import pickle
import operator

# Seed every RNG the notebook draws from (NumPy sampling and PyTorch weight
# initialisation) so a fresh Restart Kernel -> Run All is repeatable.
SEED = 3
torch.manual_seed(SEED)
np.random.seed(SEED)

## Load data

In [3]:
# Input files are read from the "Data" directory that sits next to the working
# directory; everything this notebook exports is written to the working directory.
ML_DIR = "Data"
export_dir = Path.cwd()
files_path = export_dir.parent / ML_DIR

In [4]:
print(export_dir)

/media/dready/Data/dready/LXR/Data_preprocessing


In [5]:
# The ratings file is "::"-delimited and has no header row; pandas needs the
# python engine for a multi-character separator.
data = pd.read_csv(
    files_path / "ratings.dat",
    names=["user_id_original", "item_id_original", "rating", "timestamp"],
    sep="::",
    engine="python",
)

In [6]:
data.head()

Unnamed: 0,user_id_original,item_id_original,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## Getting some data statistics on users' history

In [7]:
sum(data.groupby('user_id_original')['item_id_original'].count() < 100)

3095

In [8]:
np.mean(data.groupby('user_id_original')['item_id_original'].count())

165.5975165562914

In [9]:
# Convert the ratings to binary values (1 if the rating is positive, 0 otherwise).
# Vectorised comparison instead of a per-row Python lambda.
data["rating"] = (data["rating"] > 0).astype("int64")


# Map the original user/item identifiers to contiguous 0-based integer ids
data["user_id"] = LabelEncoder().fit_transform(data.user_id_original)
data["item_id"] = LabelEncoder().fit_transform(data.item_id_original)

# Get the number of users and items in the dataset
num_users = data.user_id.nunique()
num_items = data.item_id.nunique()

#### N unique items

In [10]:
data.item_id.nunique()

3706

## Preprocessing for baselines calculation

In [88]:
def jaccard_similarity(set1, set2):
    """
    Calculate the Jaccard similarity between two sets.

    Parameters
    ----------
    set1, set2 : set
        The two collections to compare.

    Returns
    -------
    float
        ``|set1 & set2| / |set1 | set2|``; ``0.0`` when both sets are empty
        (the ratio is undefined there and would otherwise divide by zero).
    """
    union = len(set1 | set2)
    if union == 0:
        return 0.0
    intersection = len(set1 & set2)
    return intersection / union

In [11]:
# One row per user: the user id and the list of item ids that user interacted with.
# Both columns are derived from the same groupby object.
user_group = data[["user_id", "item_id"]].groupby(data.user_id)

users_data = pd.DataFrame(
    dict(
        user_id=[*user_group.groups.keys()],
        item_ids=user_group.item_id.apply(list).tolist(),
    )
)

In [12]:
# Binary user x item matrix: one column per item id, 1 where the user has the item.
# `data=` must stay first so the binarizer is fitted before `mlb.classes_` is read.
mlb = MultiLabelBinarizer()
user_one_hot = pd.DataFrame(
    data=mlb.fit_transform(users_data["item_ids"]),
    index=users_data["item_ids"].index,
    columns=mlb.classes_,
)

In [13]:
# Append the user id as an extra, last column. NOTE: from here on, masks computed on the
# whole frame (e.g. `user_one_hot.to_numpy() > 0`) also see this column, not only item columns.
user_one_hot["user_id"] = users_data["user_id"]

In [14]:
user_one_hot

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3697,3698,3699,3700,3701,3702,3703,3704,3705,user_id
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6035
6036,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6036
6037,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6037
6038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6038


In [15]:
user_one_hot.iloc[:,:-1]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3696,3697,3698,3699,3700,3701,3702,3703,3704,3705
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6036,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6037,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Positive samples for training

In [16]:
def get_y_values(numpy_arr):
    """
    Sample one consumed item per user to predict (positive examples).

    Parameters
    ----------
    numpy_arr : 2-column array of (row, column) index pairs grouped by row,
        e.g. the result of ``np.argwhere`` on the user matrix.

    Returns
    -------
    list of int
        One sampled column index per distinct row, in row order.

    Notes
    -----
    The last column index of every row is never sampled (``u[:-1]``).
    NOTE(review): the caller passes a matrix that still contains the appended
    ``user_id`` column, so this presumably exists to skip that column - confirm
    before changing either side. A row with a single entry leaves nothing to
    sample and makes ``np.random.choice`` raise ``ValueError``.
    """
    y_values = []
    # Offsets at which a new row starts -> one sub-array of column indices per user
    group_starts = np.unique(numpy_arr[:, 0], return_index=True)[1][1:]
    users_arr = np.split(numpy_arr[:, 1], group_starts)
    for u in users_arr:
        sampled = np.random.choice(u[:-1], 1, replace=False)
        # Index the 1-element result explicitly: int() on a non-0-d array is
        # deprecated since NumPy 1.25
        y_values.append(int(sampled[0]))
    return y_values

In [17]:
# The matrix passed here still contains the trailing user_id column, so for every user with
# user_id > 0 the last (row, column) pair refers to that column rather than to an item.
# get_y_values never samples a row's last entry, which keeps that column out of the labels.
y_indices = get_y_values(np.argwhere(user_one_hot.to_numpy()>0))

In [18]:
len(y_indices)

6040

In [19]:
user_one_hot["y_positive"] = y_indices

In [20]:
user_one_hot

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3698,3699,3700,3701,3702,3703,3704,3705,user_id,y_positive
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,708
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1550
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,1327
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,2173
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,4,1356
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,6035,1581
6036,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6036,1537
6037,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6037,225
6038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6038,997


#### Create representation of each item as a binary vector

In [21]:
items_values = pd.DataFrame(np.eye(num_items,dtype=int), columns=np.arange(num_items))

In [22]:
items_values.to_csv(Path(export_dir, "items_values.csv"), index = False)

In [23]:
items_values.iloc[581][581]

1

In [None]:
items_values.index.values

In [None]:
# Map every row position of items_values to that row (a pandas Series) for direct lookup
items_values_dict = {
    row_pos: items_values.iloc[row_pos, :]
    for row_pos in range(items_values.shape[0])
}

In [None]:
# Output dictionaries to make later calculations easier:
# items mapped to their one-hot encoded vectors
file_path = 'items_values_dict_ML1.pkl'

# Serialise the dictionary into the export directory
with Path(export_dir, file_path).open('wb') as f:
    pickle.dump(items_values_dict, f)

In [None]:
items_values_dict[3704]

In [None]:
data.user_id.unique()

## Split to training and test set

In [None]:
random_state = 12

# The list of users IDs
users_indices = data.user_id.unique()

# Set the split ratio (80% for training, 20% for testing)
split_ratio = 0.8

# Shuffle with a dedicated generator seeded by random_state, so the split does
# not depend on how many draws earlier cells took from the global NumPy RNG
split_rng = np.random.RandomState(random_state)
shuffled_users_ids = split_rng.permutation(users_indices)

# Calculate the split index
split_index = int(len(shuffled_users_ids) * split_ratio)

# Split the list of user IDs into training and testing sets
train_user_ids = shuffled_users_ids[:split_index]
test_user_ids = shuffled_users_ids[split_index:]

In [None]:
# Select the rows of the user one-hot matrix that belong to the training and
# to the testing user ids (label-based lookup on the frame's index)
train_data, test_data = (
    user_one_hot.loc[selected_ids] for selected_ids in (train_user_ids, test_user_ids)
)

In [None]:
train_data

In [None]:
# Keep only the interactions of training users
training_data = data[data.user_id.isin(train_user_ids)]

# One row per item: the item id and the list of training users that have it
item_group = training_data.groupby("item_id")

items_data = pd.DataFrame(
    dict(
        item_id=[*item_group.groups.keys()],
        users_ids=item_group.user_id.apply(list).tolist(),
    )
)

In [None]:
items_data 

In [None]:
def create_user_based_Jaccard_sim():
    """
    Calculate the similarity between every ordered pair of items from the sets of
    users that consumed them (Jaccard similarity).

    Reads the module-level ``items_data`` frame (columns ``item_id`` and
    ``users_ids``) and calls ``jaccard_similarity`` for each pair of distinct items.

    Returns
    -------
    dict
        ``(item_id, item_id) -> similarity``; an item paired with itself gets 1.
    """
    # Build each item's user set once instead of once per pair
    item_ids = list(items_data["item_id"])
    user_sets = [set(users) for users in items_data["users_ids"]]

    item_similarities = {}
    for id1, users1 in zip(item_ids, user_sets):
        for id2, users2 in zip(item_ids, user_sets):
            if id1 != id2:
                similarity = jaccard_similarity(users1, users2)
            else:
                similarity = 1
            item_similarities[(id1, id2)] = similarity

    return item_similarities

In [None]:
user_similarities_Jaccard = create_user_based_Jaccard_sim()

In [None]:
file_path = 'user_similarities_Jaccard_ML1.pkl'

# Open a file in write binary mode and use pickle.dump to save the dictionary
with open(Path(export_dir, file_path), 'wb') as f:
    pickle.dump(user_similarities_Jaccard, f)

# To reload the saved similarities instead of recomputing them
# (only unpickle files you created yourself):
# with open(Path(export_dir, file_path), 'rb') as f:
#     user_similarities_Jaccard = pickle.load(f)

#### Use MultiLabelBinarizer to encode the user IDs for each item into a one-hot matrix

In [None]:
# Binary item x user matrix restricted to training users: rows are labelled by
# item_id, columns follow the order of train_user_ids.
# `data=` must stay first so the binarizer is fitted before `mlb.classes_` is read.
mlb = MultiLabelBinarizer(classes=train_user_ids)
item_one_hot = pd.DataFrame(
    data=mlb.fit_transform(items_data["users_ids"]),
    index=items_data["item_id"],
    columns=mlb.classes_,
)

In [None]:
len(train_user_ids)

In [None]:
item_one_hot

In [None]:
def item_user_based_cos_sim():
    """
    Pairwise cosine similarity between the rows of the module-level
    ``item_one_hot`` matrix (one row per item, one column per training user).
    """
    similarity_matrix = cosine_similarity(item_one_hot)
    return similarity_matrix

In [None]:
cosine_items = item_user_based_cos_sim()

In [None]:
cosine_items

In [None]:
cosine_items_dict = {}

# Key every similarity by the item ids that label item_one_hot's rows. Row
# positions only coincide with item ids when every item has at least one
# training user, so positional keys can point at the wrong item.
cosine_item_ids = item_one_hot.index.tolist()
for item_i, similarities in zip(cosine_item_ids, cosine_items):
    for item_j, similarity in zip(cosine_item_ids, similarities):
        cosine_items_dict[(item_i, item_j)] = similarity

In [None]:
cosine_items_dict[(0,0)]

In [None]:
# Serialise the item-item cosine similarity dictionary into the export directory
file_path = 'cosine_items_ML1.pkl'

with Path(export_dir, file_path).open('wb') as f:
    pickle.dump(cosine_items_dict, f)

In [None]:
cosine_items = cosine_items_dict

### Add negative examples to training data & Calculate the popularity of each item in the training set

In [None]:
popularity_dict = train_data.iloc[:,:-2].sum(axis=0).to_dict()

In [None]:
popularity_dict[3705] 

In [None]:
# Serialise the per-item popularity counts into the export directory
file_path ='pop_dict.pkl'

with Path(export_dir, file_path).open('wb') as f:
    pickle.dump(popularity_dict, f)

In [None]:
np.sum(train_data.iloc[:,3704])

In [None]:
# Normalise the popularity counts so they sum to 1 over all items.
# The total is computed once rather than on every iteration.
total_popularity = sum(popularity_dict.values())
prob_dict = {k: v / total_popularity for k, v in popularity_dict.items()}
print(len(prob_dict))

In [None]:
# Serialise the normalised item popularity values into the export directory
file_path ='prob_dict.pkl'

with Path(export_dir, file_path).open('wb') as f:
    pickle.dump(prob_dict, f)

In [None]:
sorted(prob_dict.items(), key=operator.itemgetter(1),reverse=True)[0]

In [None]:
def get_negative_samples(numpy_arr,num):
    """
    Sample negative points for training, weighted by item popularity.

    Parameters
    ----------
    numpy_arr : 2-column array of (row, column) index pairs grouped by row; the
        caller builds it with ``np.argwhere(... == 0)``, so each row lists the
        items a user has NOT interacted with.
    num : int
        Number of items to draw per user (without replacement).

    Returns
    -------
    list
        One entry per distinct row, in row order: an ``int`` when ``num == 1``,
        otherwise the array returned by ``np.random.choice``.

    Notes
    -----
    Reads the module-level ``popularity_dict``. A user with fewer than ``num``
    candidate items makes ``np.random.choice`` raise ``ValueError``. Rows absent
    from ``numpy_arr`` produce no entry, so the result is only aligned with the
    users that appear in it.
    """
    negative_values = []
    group_starts = np.unique(numpy_arr[:, 0], return_index=True)[1][1:]
    users_arr = np.split(numpy_arr[:, 1], group_starts)
    for u in users_arr:
        # Set membership instead of scanning the array once per dictionary key
        candidates = set(u.tolist())
        items_from_dict_keys = [d for d in popularity_dict.keys() if d in candidates]
        sum_popularity = sum(popularity_dict[it] for it in items_from_dict_keys)
        if sum_popularity > 0:
            items_probs = [popularity_dict[d]/sum_popularity for d in items_from_dict_keys]
        else:
            # No popularity mass among the candidates: sample uniformly instead
            # of dividing by zero
            items_probs = None
        negative_samples = np.random.choice(items_from_dict_keys,size=num,replace=False,p=items_probs)
        if(num == 1):
            # Index the 1-element result explicitly: int() on a non-0-d array is
            # deprecated since NumPy 1.25
            negative_values.append(int(negative_samples[0]))
        else:
            negative_values.append(negative_samples)

    return negative_values

In [None]:
y_negative = get_negative_samples(np.argwhere(train_data.iloc[:,:-2].to_numpy()==0), num=1)

In [None]:
train_data["y_negative"] = y_negative

In [None]:
train_data.head()

In [None]:
train_data

### Positive and Negative examples merged together

In [None]:
# Attach the sampled labels in long format: every user row is repeated once per
# sampled item, with `interaction` = 1 for the y_positive item and 0 for the
# y_negative item, and `y_values` holding the sampled item id.
train_data_mixed = (
    train_data
    .merge(
        train_data.loc[:, ['user_id', 'y_positive', 'y_negative']]
        .melt('user_id', value_name='y_values')
        .replace({'y_positive': 1, 'y_negative': 0}),
        on="user_id",
    )
    .rename(columns={'variable': 'interaction'})
    .drop(['y_positive', 'y_negative'], axis=1)
)

In [None]:
train_data_mixed.head()

In [None]:
train_data_mixed.to_csv(Path(export_dir,'train_data_mixed.csv'), index=False)

In [None]:
train_data_mixed.describe()

In [None]:
test_data.to_csv(Path(export_dir, "test_data.csv"), index = False)

### Create tf_idf file upon training data interactions

In [None]:
train_array = train_data_mixed.to_numpy()
test_array = test_data.to_numpy()

In [None]:
# Tf/idf calculation

# NOTE: train_data / test_data are rebound here from DataFrames to transposed
# NumPy arrays (item columns x rows); cells above that expect the DataFrames
# cannot be re-run after this cell without re-creating them.
# Rows with interaction == 0 are used so every training user is taken once.
train_data =(train_array[train_array[:,-2]==0][:,:-3]).T
test_data = (test_array[:,:-2]).T

# Compute the IDF-style weight of each column of the train data
num_docs = train_data.shape[0]
column_totals = np.sum(train_data, axis=0)
idf = np.log(num_docs / (column_totals + 1))

# Compute the TF-IDF scores: each column is normalised by its total and scaled
# by that column's weight, broadcast over all columns at once
tfidf_matrix = np.zeros(train_data.shape)
tfidf_matrix[:, :] = (train_data / column_totals) * idf

# (row, column) -> score, in row-major order
tf_idf_items_dict = dict(np.ndenumerate(tfidf_matrix))

In [None]:
# Serialise the TF-IDF score dictionary into the export directory
file_path = 'tf_idf_items_ML1.pkl'

with Path(export_dir, file_path).open('wb') as f:
    pickle.dump(tf_idf_items_dict, f)

## Recommender model training

In [None]:
class MLP_G(nn.Module):
    """
    Two-tower scorer: user and item vectors are each projected by a bias-free
    linear layer and the sigmoid of their dot product is returned.
    """

    def __init__(self, input_size, hidden_size):
        """
        input_size: length of the user and item input vectors.
        hidden_size: dimension of the shared embedding space.
        """
        super(MLP_G, self).__init__()
        self.linear_x = nn.Linear(input_size, hidden_size, bias = False)
        self.linear_y = nn.Linear(input_size, hidden_size, bias = False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, user, item):
        """
        Score ``user`` against ``item``.

        ``item`` may be a single vector (one score is returned) or a matrix with
        one item per row (one score per row is returned). Inputs are cast to float.
        """
        user_representation = self.linear_x(user.float())
        item_representation = self.linear_y(item.float())
        # A 1-D tensor has nothing to transpose, and `.T` on non-2-D tensors is
        # deprecated in PyTorch, so only transpose when there is a matrix
        if item_representation.dim() >= 2:
            item_representation = item_representation.T
        dot_prod = torch.matmul(user_representation, item_representation)
        dot_sigmoid = self.sigmoid(dot_prod)

        return dot_sigmoid

In [None]:
class Recommender_G(nn.Module):
    """
    Wrapper that owns an ``MLP_G`` scorer and moves the scorer, its inputs and
    its output to the module-level ``device``.
    """

    def __init__(self, num_items, hidden_size):
        super().__init__()
        scorer = MLP_G(num_items, hidden_size)
        self.mlp = scorer.to(device)

    def forward(self, user_vector, item_vector):
        """Return the scorer's output for ``user_vector`` against ``item_vector``."""
        scores = self.mlp(user_vector.to(device), item_vector.to(device))
        return scores.to(device)

In [None]:
def get_top_k(user_vector, original_user_vector, num_items, model, top_k):
    """
    Score every item for one user and return the ``top_k`` best.

    Parameters
    ----------
    user_vector : tensor or array-like fed to the model as the user input.
    original_user_vector : tensor used as the mask; each score is multiplied by
        ``1 - original_user_vector``, so entries equal to 1 end up with score 0.
    num_items : unused, kept for interface compatibility.
    model : callable taking ``(user_tensor, item_tensor)`` and returning one
        score per row of the module-level ``items_array``.
    top_k : number of entries to return.

    Returns
    -------
    dict
        ``{item index: masked score}`` for the ``top_k`` highest masked scores,
        ordered from highest to lowest.
    """
    # torch.as_tensor accepts tensors that already live on any device; the
    # legacy torch.Tensor(...) constructor rejects CUDA tensors
    user_tensor = torch.as_tensor(user_vector, dtype=torch.float32, device=device)
    item_tensor = torch.as_tensor(items_array, dtype=torch.float32, device=device)
    # Inference only: no autograd graph is needed
    with torch.no_grad():
        raw_scores = model(user_tensor, item_tensor).cpu().detach().numpy()
    output_model = [float(i) for i in raw_scores]

    original_user_vector = np.array(original_user_vector.cpu())
    neg = np.ones_like(original_user_vector)- original_user_vector
    output = neg*output_model

    # Stable sort keeps the lower item index first among equal scores
    sorted_items_by_prob = sorted(enumerate(output), key=lambda item: item[1], reverse=True)

    return dict(sorted_items_by_prob[0:top_k])

In [None]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
hidden_dim = 20
print("num_users is ", num_users)
print("num_items is ", num_items)

# Model, binary cross-entropy loss and Adam optimiser for the recommender
recommender_model_g = Recommender_G(num_items=num_items, hidden_size=hidden_dim)
recommender_loss_g = nn.BCELoss()
recommender_optimizer_g = torch.optim.Adam(params=recommender_model_g.parameters(), lr=0.001)

In [None]:
# Train the recommender one (user, item, label) row at a time and record the
# mean loss of every epoch.
train_losses = []
epochs = 10

for epoch in range(epochs):
    if epoch == 20:
        # Learning-rate decay (only reached when epochs > 20). The rate lives in
        # the optimizer's param groups; assigning an `lr` attribute on the
        # optimizer object has no effect.
        for param_group in recommender_optimizer_g.param_groups:
            param_group["lr"] = 0.0001
    train_loss = 0
    for i in range(train_array.shape[0]):
        # Row layout: item columns..., user_id, interaction, sampled item id
        item_id = train_array[i][-1]
        user_id = train_array[i][-3]
        item_vector = items_values_dict[item_id]
        user_vector = train_array[i][:-3]
        # Hide the sampled item from the user's history before scoring it.
        # NOTE(review): user_vector is a view, so this write also zeroes the
        # entry inside train_array itself and later cells that read train_array
        # see the modified rows - confirm that is intended, take a .copy() otherwise.
        user_vector[item_id] = 0
        interact = train_array[i][-2]

        user_tensor = torch.FloatTensor(user_vector).to(device)
        item_tensor = torch.FloatTensor(item_vector).to(device)

        # 0-d float target, matching the 0-d model output for a single item
        interact_tensor = torch.tensor(float(interact), dtype=torch.float32, device=device)

        recommender_optimizer_g.zero_grad()
        recommender_output = (recommender_model_g(user_tensor, item_tensor)).to(device)
        rec_loss = recommender_loss_g(recommender_output, interact_tensor)

        train_loss+=rec_loss.item()

        rec_loss.backward()
        recommender_optimizer_g.step()
    train_losses.append(train_loss/train_array.shape[0])
    print(f"Epoch {epoch}, Train Loss {train_loss/train_array.shape[0]:.4f}")

In [None]:
torch.save(recommender_model_g.state_dict(), Path(export_dir, 'recommender_model.pt'))

### Recommender Freezing

In [None]:
for param in recommender_model_g.parameters():
    param.requires_grad = False

In [None]:
# Get users vectors to create topk
# Column -3 of train_array holds the user id; every user occurs in more than one row
# (one per sampled item), and return_index yields the first row index of each distinct id.
# NOTE(review): the training loop writes zeros into train_array's item columns in place,
# so the rows selected below are the modified ones - confirm this is intended.
unique_indices = np.unique(train_array[:,-3], return_index=True, axis=0)[1]

# create a new array with only the unique users
train_unique_arr = train_array[unique_indices, :]
# Item one-hot matrix as a NumPy array; get_top_k reads it as a module-level global
items_array = items_values.to_numpy()

In [None]:
def create_topk_data(data_, rec_model, data_type = "train"):
    """
    Create the user x item score matrix of ``rec_model``.

    Parameters
    ----------
    data_ : 2-D array with one user per row: the item columns followed by
        bookkeeping columns, the first of which is the user id (three trailing
        columns for ``"train"``, two for ``"test"``).
    rec_model : model passed on to ``get_top_k``.
    data_type : ``"train"`` or ``"test"``; selects the row layout.

    Returns
    -------
    numpy.ndarray of shape ``(num_users, num_items)`` where entry
    ``[user_id, item_id]`` is the score returned by ``get_top_k``; rows of users
    not present in ``data_`` stay zero.

    Raises
    ------
    ValueError
        If ``data_type`` is neither ``"train"`` nor ``"test"``.
    """
    # Number of trailing non-item columns for each layout
    trailing_columns = {"train": 3, "test": 2}
    if data_type not in trailing_columns:
        raise ValueError(f"data_type must be 'train' or 'test', got {data_type!r}")
    offset = trailing_columns[data_type]

    user_item_matrix = np.zeros((num_users, num_items))

    for i in range(data_.shape[0]):
        # Cast so the id is a valid array index whatever dtype data_ carries
        user_id = int(data_[i][-offset])
        user_vector = data_[i][:-offset]

        user_tensor = torch.FloatTensor(user_vector).to(device)

        # top_k == num_items: keep the score of every item, not only the best few
        top_items = get_top_k(user_tensor, user_tensor, num_items, rec_model, num_items)

        # Write the whole row at once
        user_item_matrix[user_id, list(top_items.keys())] = list(top_items.values())

        if(i % 100 == 0):
            print(i)

    return user_item_matrix

In [None]:
topk_train = create_topk_data(train_unique_arr, recommender_model_g, data_type="train")

In [None]:
topk_test = create_topk_data(test_array, recommender_model_g, data_type="test")

In [None]:
# Serialise the training users' score matrix into the export directory
file_path= 'topk_train.pkl'

with Path(export_dir, file_path).open('wb') as f:
    pickle.dump(topk_train, f)

In [None]:
# Serialise the test users' score matrix into the export directory
file_path = 'topk_test.pkl'

with Path(export_dir, file_path).open('wb') as f:
    pickle.dump(topk_test, f)