In [1]:
# import the dataset
import pandas as pd
products_df = pd.read_csv('products.csv')
ratings_df = pd.read_csv('ratings.csv')

In [2]:
print('The dimensions of products dataframe are:', products_df.shape,'\nThe dimensions of ratings dataframe are:', ratings_df.shape)

The dimensions of products dataframe are: (60, 3) 
The dimensions of ratings dataframe are: (1000, 4)


In [3]:
products_df.head()

Unnamed: 0,productId,products,category
0,1,Iphone 13 Pro Max,flagship
1,2,Iphone 13 Pro,high_end
2,3,Iphone 13,high_end
3,4,Iphone 13 Mini,high_end
4,5,Iphone 12 Pro Max,high_end


In [4]:
ratings_df.head()

Unnamed: 0,userId,productId,rating,timestamp
0,1,7,3,964982931.0
1,2,30,3,964982176.0
2,3,51,5,964984002.0
3,4,28,2,964982681.0
4,5,44,5,964984041.0


In [5]:
#Products ID to Product name mapping
products_names = products_df.set_index('productId')['products'].to_dict()
n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.productId.unique())
print("Number of unique users:", n_users)
print("Number of unique products:", n_items)
print("The full rating matrix will have:", n_users*n_items, 'elements.')
print('----------')
print("Number of ratings:", len(ratings_df))
print("Therefore: ", len(ratings_df) / (n_users*n_items) * 100, '% of the matrix is filled.')
#("We have an incredibly sparse matrix to work with here.")
#("And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2")
#("You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.")
#("One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data")

Number of unique users: 1000
Number of unique products: 60
The full rating matrix will have: 60000 elements.
----------
Number of ratings: 1000
Therefore:  1.6666666666666667 % of the matrix is filled.


In [6]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

#Matrix factorization is a class of collaborative filtering algorithms used in recommender systems. 
#Matrix factorization algorithms work by decomposing the user-item interaction matrix into the product of two lower dimensionality rectangular matrices.

class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=4):
        super().__init__()
        
        # Create user embeddings
        self.user_factors = torch.nn.Embedding(n_users, n_factors) # think of this as a lookup table for the input.
        # Create item embeddings
        self.item_factors = torch.nn.Embedding(n_items, n_factors) # think of this as a lookup table for the input.
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)
        
    def forward(self, data):
        # Matrix multiplication
        users, items = data[:,0], data[:,1]
        return (self.user_factors(users)*self.item_factors(items)).sum(1)
    # def forward(self, user, item):
    # 	# matrix multiplication
    #     return (self.user_factors(user)*self.item_factors(item)).sum(1)
    
    def predict(self, user, item):
        return self.forward(user, item)

In [7]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

class Loader(Dataset):
    def __init__(self):
        self.ratings = ratings_df.copy()
        
        # Extract all user IDs and Product IDs
        users = ratings_df.userId.unique()
        products = ratings_df.productId.unique()
        
#self. is the "tag" method in Python to refer to instance attributes       

        #--- Producing new continuous IDs for Users and Products ---
        
        # Unique values : index
        self.userid2idx = {o:i for i,o in enumerate(users)}
        self.productid2idx = {o:i for i,o in enumerate(products)}
        
        # Obtained continuous ID for Users and Products
        self.idx2userid = {i:o for o,i in self.userid2idx.items()}
        self.idx2productid = {i:o for o,i in self.productid2idx.items()}
        
        # Return the id from the indexed values as noted in the lambda function down below.
        self.ratings.productId = ratings_df.productId.apply(lambda x: self.productid2idx[x])
        self.ratings.userId = ratings_df.userId.apply(lambda x: self.userid2idx[x])
        
        
        self.x = self.ratings.drop(['rating', 'timestamp'], axis=1).values
        self.y = self.ratings['rating'].values
        self.x, self.y = torch.tensor(self.x), torch.tensor(self.y) # Transforms the data to tensors (ready for torch models.)

    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.ratings)

In [8]:
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
# GPU enable if you have a GPU...
if cuda:
    model = model.cuda()
    
# The three steps to building a prototype: 
# 1) defining the model, 
# 2) defining the loss,
# 3) and picking an optimization technique. The latter two steps are largely built into PyTorch, so we’ll start with the hardest first.

# MSE loss (MEAN SQUARE ERROR: loss between the matrix factorization “prediction” and the actual user-item ratings)
loss_fn = torch.nn.MSELoss()

#adagrad_loss = torch.optim.Adagrad(model.parameters(), lr= 1e-3) #different optimization algorithm

#Me lr=1e-1 exoume to xamhlotero error sthn teleutaia epoch
# ADAM optimizier 
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) #learnig rate

# Train data
train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle=True)

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(1000, 8)
  (item_factors): Embedding(60, 8)
)
user_factors.weight tensor([[0.0127, 0.0428, 0.0190,  ..., 0.0065, 0.0216, 0.0149],
        [0.0124, 0.0076, 0.0061,  ..., 0.0237, 0.0024, 0.0069],
        [0.0490, 0.0101, 0.0342,  ..., 0.0058, 0.0262, 0.0269],
        ...,
        [0.0149, 0.0370, 0.0375,  ..., 0.0332, 0.0089, 0.0205],
        [0.0172, 0.0001, 0.0373,  ..., 0.0320, 0.0043, 0.0021],
        [0.0052, 0.0159, 0.0408,  ..., 0.0230, 0.0131, 0.0354]])
item_factors.weight tensor([[0.0071, 0.0324, 0.0195, 0.0218, 0.0406, 0.0470, 0.0402, 0.0003],
        [0.0373, 0.0270, 0.0373, 0.0344, 0.0038, 0.0155, 0.0060, 0.0059],
        [0.0245, 0.0197, 0.0399, 0.0465, 0.0228, 0.0116, 0.0037, 0.0117],
        [0.0374, 0.0493, 0.0290, 0.0427, 0.0068, 0.0233, 0.0184, 0.0484],
        [0.0053, 0.0083, 0.0249, 0.0425, 0.0386, 0.0060, 0.0314, 0.0336],
        [0.0093, 0.0369, 0.0333, 0.0338, 0.0087, 0.0197, 0.0248, 0.0200],

In [9]:
#TQDM is used for creating Progress Meters or Progress Bars in Python.
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
         if cuda:
            x, y = x.cuda(), y.cuda()
            optimizer.zero_grad()
            outputs = model(x)
            loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
            losses.append(loss.item())
            loss.backward()
            optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 11.005945801734924
iter #1 Loss: 10.959268450737
iter #2 Loss: 10.968974947929382
iter #3 Loss: 10.905981063842773
iter #4 Loss: 10.856761932373047
iter #5 Loss: 10.880078673362732
iter #6 Loss: 10.853960394859314
iter #7 Loss: 10.811119079589844
iter #8 Loss: 10.778462409973145
iter #9 Loss: 10.723978400230408
iter #10 Loss: 10.691505312919617
iter #11 Loss: 10.704487085342407
iter #12 Loss: 10.631688117980957
iter #13 Loss: 10.545263528823853
iter #14 Loss: 10.520253539085388
iter #15 Loss: 10.473262071609497
iter #16 Loss: 10.388989567756653
iter #17 Loss: 10.303861618041992
iter #18 Loss: 10.248422384262085
iter #19 Loss: 10.227567315101624
iter #20 Loss: 10.12908399105072
iter #21 Loss: 9.993457913398743
iter #22 Loss: 9.945315480232239
iter #23 Loss: 9.878607153892517
iter #24 Loss: 9.771252751350403
iter #25 Loss: 9.659219145774841
iter #26 Loss: 9.530373811721802
iter #27 Loss: 9.466899156570435
iter #28 Loss: 9.342447519302368
iter #29 Loss: 9.209536254405975
ite

In [10]:
# By training the model, we will have tuned latent factors for products and users. 
# Latent variable is a variable that cannot be observed. The presence of latent variables, however, can be detected by their effects on variables that are observable

c = 0
uw = 0
iw = 0 
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
        if c == 0:
          uw = param.data
          c +=1
        else:
          iw = param.data
        #print('param_data', param_data)

user_factors.weight tensor([[0.4058, 0.4196, 0.4025,  ..., 0.3752, 0.3943, 0.4108],
        [0.3888, 0.3902, 0.3833,  ..., 0.4143, 0.3998, 0.4027],
        [0.5033, 0.4673, 0.4818,  ..., 0.4678, 0.4955, 0.4884],
        ...,
        [0.2699, 0.3098, 0.3043,  ..., 0.3083, 0.2702, 0.2805],
        [0.4427, 0.4294, 0.4772,  ..., 0.4657, 0.4247, 0.4445],
        [0.1344, 0.1395, 0.1567,  ..., 0.1492, 0.1313, 0.1527]],
       device='cuda:0')
item_factors.weight tensor([[0.9098, 0.9366, 0.9201, 0.9074, 0.9548, 0.9449, 0.9352, 0.9136],
        [0.9284, 0.9113, 0.9208, 0.9377, 0.8994, 0.8837, 0.8912, 0.8901],
        [1.0110, 1.0201, 1.0365, 1.0472, 1.0051, 0.9944, 0.9923, 1.0040],
        [0.9709, 0.9624, 0.9457, 0.9578, 0.9320, 0.9506, 0.9421, 0.9675],
        [0.9616, 1.0077, 0.9957, 1.0287, 1.0025, 0.9747, 1.0042, 1.0174],
        [0.8592, 0.8851, 0.9049, 0.8781, 0.8583, 0.8758, 0.8863, 0.8742],
        [0.9861, 1.0129, 1.0099, 1.0086, 1.0011, 1.0302, 1.0241, 0.9872],
        [0.8987, 0.8

In [11]:
trained_products_embeddings = model.item_factors.weight.data.cpu().numpy()

In [12]:
len(trained_products_embeddings) # Unique products factor weights

60

In [13]:
from sklearn.cluster import KMeans
# Fit the clusters based on the products weights
kmeans = KMeans(n_clusters=4, random_state=0).fit(trained_products_embeddings) #Orismos CLUSTERS

In [14]:
'''It can be seen here that the products that are in the same cluster tend to have
similar categories. #"THA EPREPE NA EXEI TIS IDIES KATHGORIES" 
Also note that the algorithm is unfamiliar with the product name
and only obtained the relationships by looking at the numbers representing how
users have responded to the products selections.'''

#Epileksame na orisoume 4 clusters epeidh einai 4 kai oi kathgories twn proiontwn mas (Flagship, High End, Mid Range & Low End)

for cluster in range(4):
  print("Cluster #{}".format(cluster))
  prods = []
  for prodidx in np.where(kmeans.labels_ == cluster)[0]:
    prodid = train_set.idx2productid[prodidx]
    rat_count = ratings_df.loc[ratings_df['productId']==prodid].count()[0]
    prods.append((products_names[prodid], rat_count))
  for prod in sorted(prods, key=lambda tup: tup[1], reverse=True)[:10]:
    print("\t", prod[0])

Cluster #0
	 Iphone 12 Mini
	 Huawei P50
	 One Plus 8 Pro
	 Samsung Galaxy S20 Plus
	 Samsung Galaxy S21 Ultra
	 One Plus 8T
	 Realme 9 Pro
	 One Plus Nord 2
	 Xiaomi Redmi Note 10
	 Google Pixel 5
Cluster #1
	 Google Pixel 6a
	 Realme 9 Pro Plus
	 Realme GT Neo 2
	 Huawei P40 Pro
	 Realme Gt Neo 3 Pro
	 Samsung Galaxy S22 Ultra
	 Xiaomi 12 Pro
	 One Plus 9 Pro
Cluster #2
	 Samsung Galaxy S21
	 Xiaomi Poco M4 Pro
	 Xiaomi Redmi Note 11 Pro
	 Iphone 11 Pro
	 Xiaomi 12X
	 Huawei P40
	 Google Pixel 5a
	 Samsung Galaxy S20 Fe
	 Xiaomi Poco F3
	 Huawei P40 lite
Cluster #3
	 Iphone 13 Mini
	 Iphone 13 Pro
	 Huawei 8i
	 Xiaomi Redmi Note 11
	 Samsung Galaxy S22 Plus
	 Iphone 11 Pro Max
	 Xiaomi Redmi Note 9 Pro
	 Realme GT 2
	 Huawei P30 Pro
	 Iphone 13 Pro Max
