In [38]:
# import the dataset
import pandas as pd
# Read the product catalogue and the user ratings from the working directory.
products_df, ratings_df = (
    pd.read_csv(csv_path) for csv_path in ('products.csv', 'ratings.csv')
)

In [39]:
# Quick sanity check on the load: (rows, columns) of each dataframe.
print(
    'The dimensions of products dataframe are:', products_df.shape,
    '\nThe dimensions of ratings dataframe are:', ratings_df.shape,
)

The dimensions of products dataframe are: (60, 3) 
The dimensions of ratings dataframe are: (1000, 4)


In [40]:
# Preview the first five catalogue rows.
products_df.head(n=5)

Unnamed: 0,productId,products,category
0,1,Iphone 13 Pro Max,flagship
1,2,Iphone 13 Pro,high end
2,3,Iphone 13,high end
3,4,Iphone 13 Mini,high end
4,5,Iphone 12 Pro Max,high end


In [41]:
# Preview the first five rating rows.
ratings_df.head(n=5)

Unnamed: 0,userId,productId,rating,timestamp
0,1,7,3,964982931.0
1,2,30,3,964982176.0
2,3,51,5,964984002.0
3,4,28,2,964982681.0
4,5,44,5,964984041.0


In [52]:
# Lookup table: product ID -> product name (used later to label cluster members).
products_names = dict(zip(products_df['productId'], products_df['products']))

# Dimensions of the (user x product) rating matrix implied by the ratings file.
n_users = ratings_df['userId'].unique().size
n_items = ratings_df['productId'].unique().size
n_ratings = len(ratings_df)
matrix_cells = n_users * n_items

print("Number of unique users:", n_users)
print("Number of unique products:", n_items)
print("The full rating matrix will have:", matrix_cells, 'elements.')
print('----------')
print("Number of ratings:", n_ratings)
print("Therefore: ", n_ratings / matrix_cells * 100, '% of the matrix is filled.')

# Notes:
# - The rating matrix is very sparse: only a small fraction of the cells is observed.
# - The number of cells grows with the product n_users * n_items, so it increases
#   quickly as users and products are added.
# - At global scale, holding the full dense matrix in memory would be a challenge.
# - Matrix factorization represents the rating matrix implicitly through the
#   learned factors, so the dense matrix never has to be materialised.

Number of unique users: 1000
Number of unique products: 60
The full rating matrix will have: 60000 elements.
----------
Number of ratings: 1000
Therefore:  1.6666666666666667 % of the matrix is filled.


In [53]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

#Matrix factorization is a class of collaborative filtering algorithms used in recommender systems. 
#Matrix factorization algorithms work by decomposing the user-item interaction matrix into the product of two lower dimensionality rectangular matrices.

class MatrixFactorization(torch.nn.Module):
    """Dot-product matrix factorization over user and item embeddings.

    Every user and every item owns an ``n_factors``-dimensional embedding
    vector. The score of a (user, item) pair is the sum of the element-wise
    product of the two vectors, i.e. their dot product.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: size of each embedding vector.
    """

    def __init__(self, n_users, n_items, n_factors=4):
        super().__init__()

        # Embedding tables act as lookup tables: index -> factor vector.
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)

        # Overwrite the default initialisation with small values in [0, 0.05).
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: 2-D integer tensor; column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            1-D tensor with one score per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        # Row-wise dot product of the looked-up user and item vectors.
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Score user/item indices given as two separate arguments.

        The previous implementation called ``self.forward(user, item)``, which
        always raised ``TypeError`` because ``forward`` takes a single ``data``
        tensor. The indices are now packed into that layout first.

        Args:
            user: user index or indices (int, sequence or tensor).
            item: item index or indices (int, sequence or tensor). ``user``
                and ``item`` are broadcast against each other, so a single
                user can be scored against many items and vice versa.

        Returns:
            1-D tensor of scores, one per (user, item) pair.
        """
        # Build the index tensors on the same device as the embedding tables.
        device = self.user_factors.weight.device
        user = torch.as_tensor(user, dtype=torch.long, device=device)
        item = torch.as_tensor(item, dtype=torch.long, device=device)
        user, item = torch.broadcast_tensors(user, item)
        pairs = torch.stack((user.reshape(-1), item.reshape(-1)), dim=1)
        return self.forward(pairs)

In [54]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

class Loader(Dataset):
    """Torch dataset of (user index, product index) -> rating samples.

    Raw user and product IDs are re-mapped to contiguous indices starting at
    0, in order of first appearance, so they can address embedding tables.

    Args:
        ratings: dataframe with at least the columns ``userId``,
            ``productId`` and ``rating``. When omitted, the notebook-level
            ``ratings_df`` is used, which keeps ``Loader()`` working as before.

    Attributes set on the instance:
        ratings: copy of the input with ``userId``/``productId`` replaced by
            their contiguous indices.
        userid2idx / productid2idx: raw ID -> contiguous index.
        idx2userid / idx2productid: contiguous index -> raw ID.
        x: tensor with one row per rating, columns (user index, product index).
        y: tensor with the rating of each row.
    """

    def __init__(self, ratings=None):
        if ratings is None:
            # Backward-compatible default: the dataframe loaded by the notebook.
            ratings = ratings_df
        # Work on a copy so the caller's dataframe is never modified.
        self.ratings = ratings.copy()

        # Unique raw IDs, in order of first appearance.
        users = ratings['userId'].unique()
        products = ratings['productId'].unique()

        # Raw ID -> contiguous index.
        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.productid2idx = {o: i for i, o in enumerate(products)}

        # Contiguous index -> raw ID (inverse mappings).
        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2productid = {i: o for o, i in self.productid2idx.items()}

        # Replace the raw IDs with their contiguous indices.
        self.ratings['productId'] = ratings['productId'].map(self.productid2idx)
        self.ratings['userId'] = ratings['userId'].map(self.userid2idx)

        # Select the feature columns explicitly (user first, product second)
        # instead of dropping the others: an unexpected extra column in the
        # input can then no longer shift which column is read as user/item.
        self.x = torch.tensor(self.ratings[['userId', 'productId']].values)
        self.y = torch.tensor(self.ratings['rating'].values)

    def __getitem__(self, index):
        """Return the ``(features, rating)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of rating rows."""
        return len(self.ratings)

In [55]:
num_epochs = 128
cuda = torch.cuda.is_available()
print("Is running on GPU:", cuda)

# Building the prototype takes three steps:
#   1) define the model,
#   2) define the loss,
#   3) pick an optimisation technique.
# Steps 2 and 3 are provided by PyTorch.

# 1) Model: one 8-dimensional factor vector per user and per product.
model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
# Show the freshly initialised trainable parameters.
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)

# Move the model to the GPU when one is available.
if cuda:
    model = model.cuda()

# 2) Loss: mean squared error between the predicted and the actual ratings.
loss_fn = torch.nn.MSELoss()

# 3) Optimiser: Adam.
# Alternative that was tried: torch.optim.Adagrad(model.parameters(), lr=1e-3).
# Note from experiments: lr=1e-1 gave the lowest error in the last epoch.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training data, served in shuffled mini-batches of 128 samples.
train_set = Loader()
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(1000, 8)
  (item_factors): Embedding(60, 8)
)
user_factors.weight tensor([[0.0297, 0.0207, 0.0133,  ..., 0.0158, 0.0200, 0.0463],
        [0.0192, 0.0017, 0.0273,  ..., 0.0381, 0.0341, 0.0007],
        [0.0151, 0.0161, 0.0272,  ..., 0.0160, 0.0493, 0.0273],
        ...,
        [0.0052, 0.0275, 0.0301,  ..., 0.0157, 0.0305, 0.0270],
        [0.0110, 0.0086, 0.0484,  ..., 0.0110, 0.0239, 0.0314],
        [0.0014, 0.0197, 0.0261,  ..., 0.0357, 0.0359, 0.0318]])
item_factors.weight tensor([[4.0732e-02, 3.1435e-02, 2.8046e-02, 1.1894e-02, 6.8243e-03, 4.8997e-02,
         3.3773e-02, 4.2094e-02],
        [2.8347e-02, 2.2862e-02, 2.9277e-02, 1.1884e-02, 3.8806e-02, 9.0326e-03,
         2.0869e-03, 3.6922e-02],
        [2.7217e-02, 1.8259e-02, 2.5954e-04, 3.1756e-04, 2.0218e-04, 3.5067e-02,
         2.5417e-02, 4.6746e-02],
        [4.6985e-02, 2.5058e-03, 2.5460e-02, 3.2716e-02, 1.8849e-03, 3.7161e-02,
         2.4927e-

In [56]:
# Train for `num_epochs` passes over the data; tqdm renders the progress bar.
for epoch in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        # Only the device transfer depends on CUDA. The optimisation step
        # below must run on CPU as well; it used to be nested under this `if`,
        # so on a CPU-only machine nothing was trained and the epoch average
        # divided by zero.
        if cuda:
            x, y = x.cuda(), y.cuda()

        optimizer.zero_grad()
        # The model already returns one score per sample (1-D), so no squeeze
        # is needed; squeezing would also break a final batch of size 1.
        outputs = model(x)
        # Ratings are integers; MSELoss needs floating-point targets.
        loss = loss_fn(outputs, y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    # Mean mini-batch loss of this epoch.
    print("iter #{}".format(epoch), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 10.980419397354126
iter #1 Loss: 10.958233714103699
iter #2 Loss: 10.950667023658752
iter #3 Loss: 10.890259623527527
iter #4 Loss: 10.924445867538452
iter #5 Loss: 10.874897122383118
iter #6 Loss: 10.855423092842102
iter #7 Loss: 10.827425599098206
iter #8 Loss: 10.814464688301086
iter #9 Loss: 10.732796907424927
iter #10 Loss: 10.722390413284302
iter #11 Loss: 10.689422488212585
iter #12 Loss: 10.620305180549622
iter #13 Loss: 10.58441686630249
iter #14 Loss: 10.527076005935669
iter #15 Loss: 10.466491937637329
iter #16 Loss: 10.389429807662964
iter #17 Loss: 10.35870087146759
iter #18 Loss: 10.245766043663025
iter #19 Loss: 10.163869976997375
iter #20 Loss: 10.091012954711914
iter #21 Loss: 10.048941612243652
iter #22 Loss: 9.956334114074707
iter #23 Loss: 9.842539191246033
iter #24 Loss: 9.746269345283508
iter #25 Loss: 9.639029383659363
iter #26 Loss: 9.528591394424438
iter #27 Loss: 9.450729370117188
iter #28 Loss: 9.366801738739014
iter #29 Loss: 9.239516258239746


In [57]:
# Training tunes the latent factors of the users and the products.
# A latent variable cannot be observed directly; its presence is detected
# through its effect on variables that can be observed (here: the ratings).

# uw receives the first trainable parameter; iw receives any later one
# (the last one wins). c flags whether the first one has been seen.
c = 0
uw = 0
iw = 0
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c:
        iw = param.data
    else:
        uw = param.data
        c += 1

user_factors.weight tensor([[0.3957, 0.3917, 0.3860,  ..., 0.3789, 0.3899, 0.4121],
        [0.3979, 0.3839, 0.4057,  ..., 0.4276, 0.4277, 0.3748],
        [0.4691, 0.4743, 0.4976,  ..., 0.4658, 0.5033, 0.4715],
        ...,
        [0.2567, 0.2852, 0.2852,  ..., 0.2828, 0.2983, 0.2823],
        [0.4479, 0.4365, 0.4735,  ..., 0.4382, 0.4446, 0.4648],
        [0.1302, 0.1446, 0.1558,  ..., 0.1555, 0.1648, 0.1627]],
       device='cuda:0')
item_factors.weight tensor([[0.9749, 0.9439, 0.9556, 0.9220, 0.9097, 0.9795, 0.9213, 0.9620],
        [0.8993, 0.8945, 0.8935, 0.8887, 0.9191, 0.8943, 0.8823, 0.9198],
        [1.0071, 0.9963, 0.9767, 0.9547, 0.9707, 1.0035, 0.9853, 1.0245],
        [0.9977, 0.9350, 0.9550, 0.9528, 0.9340, 0.9759, 0.9788, 0.9343],
        [0.9935, 1.0159, 0.9914, 0.9861, 0.9746, 0.9786, 0.9792, 1.0077],
        [0.8568, 0.9053, 0.8591, 0.9096, 0.8589, 0.9083, 0.8791, 0.8815],
        [0.9815, 0.9985, 0.9922, 0.9969, 0.9877, 0.9880, 0.9700, 1.0177],
        [0.8966, 0.8

In [58]:
# Copy the learned product factors to host memory as a NumPy array.
product_weights = model.item_factors.weight.data
trained_products_embeddings = product_weights.cpu().numpy()

In [59]:
# Number of embedding rows, i.e. one factor vector per unique product.
trained_products_embeddings.shape[0]

60

In [60]:
from sklearn.cluster import KMeans
# Define 4 clusters and fit them on the learned product factor vectors.
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(trained_products_embeddings)

In [61]:
# Products that land in the same cluster are expected to have similar
# categories (ideally the same category).
# The algorithm never sees product names or categories: the relationships come
# only from the numbers that describe how users rated the products.
#
# 4 clusters were chosen because the products have 4 categories
# (Flagship, High End, Mid Range & Low End).

# Number of ratings per product ID, computed once instead of rescanning the
# ratings dataframe for every product.
rating_counts = ratings_df['productId'].value_counts().to_dict()
top_n = 10  # products listed per cluster

# Use the cluster count of the fitted model rather than repeating the literal.
for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    prods = []
    # Row i of the embedding matrix belongs to contiguous product index i.
    for prodidx in np.where(kmeans.labels_ == cluster)[0]:
        prodid = train_set.idx2productid[prodidx]
        prods.append((products_names[prodid], rating_counts.get(prodid, 0)))
    # Most-rated products first; the sort is stable, so ties keep index order.
    for prod in sorted(prods, key=lambda tup: tup[1], reverse=True)[:top_n]:
        print("\t", prod[0])

Cluster #0
	 Samsung Galaxy S21
	 Xiaomi Poco M4 Pro
	 Xiaomi Redmi Note 11 Pro
	 Iphone 12 Mini
	 Huawei P50
	 Iphone 11 Pro
	 Xiaomi 12X
	 Huawei P40
	 Samsung Galaxy S21 Ultra
	 Google Pixel 5a
Cluster #1
	 Iphone 13 Mini
	 Iphone 13 Pro
	 Huawei 8i
	 Xiaomi Redmi Note 11
	 Samsung Galaxy S22 Plus
	 Xiaomi Redmi Note 9 Pro
	 Realme GT 2
	 Huawei P30 Pro
	 Realme 8 Pro
	 Relame GT
Cluster #2
	 One Plus 8 Pro
	 Samsung Galaxy S20 Plus
	 One Plus 8T
	 Realme 9 Pro
	 One Plus Nord 2
	 Google Pixel 5
	 Iphone 12 Pro
	 Xiaomi 12
	 Samsung Galaxy S20 Ultra
	 Realme Gt Master Edition
Cluster #3
	 Iphone 11 Pro Max
	 Google Pixel 6a
	 Realme 9 Pro Plus
	 Realme GT Neo 2
	 Huawei P40 Pro
	 Realme Narzo
	 Samsung Galaxy S22 Ultra
	 Xiaomi 12 Pro
	 One Plus 9 Pro
