In [1]:
# Import pandas, which is used to load and inspect the datasets
import pandas as pd

# Load the products table and the ratings table from CSV files.
# Both paths are relative, so they are resolved against the current working directory.
products_df = pd.read_csv('products.csv')
ratings_df = pd.read_csv('ratings_100k.csv')

In [2]:
# Report the (rows, columns) shape of each dataframe to the reader.
print('The dimensions of products dataframe are:', products_df.shape,'\nThe dimensions of ratings dataframe are:', ratings_df.shape)

The dimensions of products dataframe are: (60, 3) 
The dimensions of ratings dataframe are: (100000, 4)


In [3]:
# Preview the first rows of the products dataframe.
products_df.head()

Unnamed: 0,productId,products,category
0,1,Iphone 13 Pro Max,flagship
1,2,Iphone 13 Pro,high_end
2,3,Iphone 13,high_end
3,4,Iphone 13 Mini,high_end
4,5,Iphone 12 Pro Max,high_end


In [4]:
# Preview the first rows of the ratings dataframe.
ratings_df.head()

Unnamed: 0,userId,productId,rating,timestamp
0,1,7,3,964982931
1,2,30,3,964982176
2,3,51,5,964984002
3,4,28,2,964982681
4,5,44,5,964984041


In [5]:
# Lookup table: product ID -> product name
products_names = products_df.set_index('productId')['products'].to_dict()

# Dimensions of the (users x products) rating matrix
n_users = len(ratings_df['userId'].unique())
n_items = len(ratings_df['productId'].unique())
matrix_size = n_users * n_items
n_ratings = len(ratings_df)

print("Number of unique users:", n_users)
print("Number of unique products:", n_items)
print("The full rating matrix will have:", matrix_size, 'elements.')
print('----------')
print("Number of ratings:", n_ratings)
print("Therefore: ", n_ratings / matrix_size * 100, '% of the matrix is filled.')

# The rating matrix is sparse: only a small share of its cells hold a rating.
# A dense matrix has n_users * n_items cells, so its size grows with the product
# of both counts; keeping it fully in memory becomes a problem at large scale.
# Matrix factorization represents the rating matrix implicitly through two
# low-rank factor matrices, so the dense matrix never has to be materialised.

Number of unique users: 100000
Number of unique products: 60
The full rating matrix will have: 6000000 elements.
----------
Number of ratings: 100000
Therefore:  1.6666666666666667 % of the matrix is filled.


In [6]:
#torch library provides a wide range of algorithms for deep learning
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

#Matrix factorization is a class of collaborative filtering algorithms used in a few recommender systems. 
#Matrix factorization algorithms work by decomposing the user-item interaction matrix into the product of two lower 
#dimensionality rectangular matrices.

class MatrixFactorization(torch.nn.Module):
    """Matrix-factorization recommender built from two embedding tables.

    Every user and every item is represented by a vector of ``n_factors``
    latent factors. The predicted rating for a (user, item) pair is the dot
    product of the two corresponding vectors.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: length of each latent-factor vector.
    """

    def __init__(self, n_users, n_items, n_factors=4):
        super().__init__()

        # Embedding layers act as lookup tables: index -> latent-factor vector.
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)

        # Start from small positive weights drawn uniformly from [0, 0.05).
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Predict ratings for a batch of (user index, item index) pairs.

        Args:
            data: 2-D integer tensor; column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            1-D tensor with one predicted rating per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        # Row-wise dot product of the user and item factor vectors.
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Predict ratings from separate user and item indices.

        The previous implementation forwarded both arguments to ``forward``,
        which accepts a single tensor, so every call raised ``TypeError``.

        Args:
            user: user index or indices (int, sequence or integer tensor).
            item: item index or indices, same shape as ``user``.

        Returns:
            Tensor of predicted ratings with the same shape as ``user``.
        """
        device = self.user_factors.weight.device
        user = torch.as_tensor(user, dtype=torch.long, device=device)
        item = torch.as_tensor(item, dtype=torch.long, device=device)
        # Summing over the last axis works for scalar and batched indices alike.
        return (self.user_factors(user) * self.item_factors(item)).sum(-1)

In [7]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
# DataLoader draws (optionally shuffled) mini-batches from a Dataset
from torch.utils.data import DataLoader 

class Loader(Dataset):
    """Torch ``Dataset`` wrapping a ratings table for matrix factorization.

    Raw user and product IDs are re-mapped to contiguous indices starting at 0
    (in order of first appearance) so they can be used as embedding indices.

    Args:
        ratings: dataframe with ``userId``, ``productId`` and ``rating``
            columns. When omitted, the notebook-level ``ratings_df`` is used,
            which keeps ``Loader()`` working as before.

    Attributes:
        ratings: copy of the input frame with both ID columns replaced by
            their contiguous indices.
        userid2idx / productid2idx: raw ID -> contiguous index.
        idx2userid / idx2productid: contiguous index -> raw ID.
        x: integer tensor with one ``[user index, product index]`` row per rating.
        y: tensor with the rating of each row.
    """

    def __init__(self, ratings=None):
        if ratings is None:
            ratings = ratings_df
        # Work on a copy so the caller's dataframe is never modified.
        self.ratings = ratings.copy()

        # All distinct raw IDs, in order of first appearance.
        users = ratings['userId'].unique()
        products = ratings['productId'].unique()

        # --- Contiguous IDs for users and products ---
        # raw ID -> index
        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.productid2idx = {o: i for i, o in enumerate(products)}

        # index -> raw ID
        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2productid = {i: o for o, i in self.productid2idx.items()}

        # Replace the raw IDs by their contiguous indices.
        self.ratings['productId'] = ratings['productId'].map(self.productid2idx)
        self.ratings['userId'] = ratings['userId'].map(self.userid2idx)

        # Select the two feature columns by name: the model reads column 0 as
        # the user and column 1 as the product, so extra or re-ordered columns
        # in the input must not leak into the features.
        features = self.ratings[['userId', 'productId']].values
        targets = self.ratings['rating'].values
        self.x, self.y = torch.tensor(features), torch.tensor(targets)

    def __getitem__(self, index):
        """Return the ``(features, rating)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of ratings in the dataset."""
        return len(self.ratings)

In [8]:
# --- Training configuration ---
num_epochs = 128
cuda = torch.cuda.is_available()
print("Is running on GPU:", cuda)

# A prototype needs three ingredients: a model, a loss and an optimizer.
# The loss and the optimizer come ready-made with PyTorch.

# 1) Model
model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
# Show the freshly initialised trainable weights.
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)

# Move the model to the GPU when one is available.
if cuda:
    model = model.cuda()

# 2) Loss: mean squared error between the predicted and the actual ratings.
loss_fn = torch.nn.MSELoss()

# 3) Optimizer: Adam (torch.optim.Adagrad was noted as an alternative).
# NOTE(review): an earlier note said lr=1e-1 gave the lowest final-epoch error,
# yet the learning rate used here is 1e-3 -- confirm which value is intended.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training data, served in shuffled mini-batches of 128 ratings.
train_set = Loader()
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(100000, 8)
  (item_factors): Embedding(60, 8)
)
user_factors.weight tensor([[0.0236, 0.0346, 0.0169,  ..., 0.0424, 0.0201, 0.0100],
        [0.0462, 0.0436, 0.0343,  ..., 0.0172, 0.0059, 0.0200],
        [0.0478, 0.0220, 0.0352,  ..., 0.0137, 0.0282, 0.0427],
        ...,
        [0.0331, 0.0412, 0.0322,  ..., 0.0159, 0.0378, 0.0171],
        [0.0010, 0.0479, 0.0409,  ..., 0.0202, 0.0461, 0.0434],
        [0.0154, 0.0329, 0.0344,  ..., 0.0384, 0.0483, 0.0142]])
item_factors.weight tensor([[3.1374e-02, 1.1103e-02, 1.3431e-02, 6.5954e-03, 4.6274e-02, 2.2526e-02,
         1.0400e-02, 3.4209e-02],
        [3.7059e-02, 1.9746e-02, 1.4666e-02, 2.1866e-02, 4.3412e-03, 3.6595e-02,
         2.5914e-02, 4.8392e-02],
        [4.3732e-02, 2.2399e-02, 5.1709e-03, 1.7975e-02, 4.8494e-02, 2.2326e-02,
         2.9496e-02, 8.9814e-04],
        [2.9339e-02, 1.9221e-02, 2.7852e-02, 8.7793e-03, 4.6504e-02, 4.8392e-02,
         1.2093

In [9]:
# tqdm renders a progress bar over the epochs.
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        # Only the device transfer depends on CUDA. The optimisation step below
        # used to be nested under this check, so on a CPU-only machine nothing
        # was trained and the epoch average divided by zero.
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        # The model returns one prediction per row, already 1-D like y, so no
        # squeeze is needed (squeezing would break a final batch of size 1).
        outputs = model(x)
        loss = loss_fn(outputs, y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    # Mean batch loss of the epoch.
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 10.612371312992652
iter #1 Loss: 9.105746975945085
iter #2 Loss: 6.3355094002335886
iter #3 Loss: 3.4893522128424683
iter #4 Loss: 1.4786324491128897
iter #5 Loss: 0.48493406896853386
iter #6 Loss: 0.13831525898116934
iter #7 Loss: 0.06237101252369411
iter #8 Loss: 0.06050101955376013
iter #9 Loss: 0.06650430211306685
iter #10 Loss: 0.07306791317013218
iter #11 Loss: 0.07685987026814152
iter #12 Loss: 0.08791658001215867
iter #13 Loss: 0.10127077803320592
iter #14 Loss: 0.10284592001639364
iter #15 Loss: 0.08925075715650684
iter #16 Loss: 0.08453292143352502
iter #17 Loss: 0.08543732386949422
iter #18 Loss: 0.08750321596975216
iter #19 Loss: 0.08612327395802569
iter #20 Loss: 0.08404774722807548
iter #21 Loss: 0.08163805295477437
iter #22 Loss: 0.08103881657237896
iter #23 Loss: 0.07967995588317552
iter #24 Loss: 0.07916025585873658
iter #25 Loss: 0.07744263784240579
iter #26 Loss: 0.07676182403360181
iter #27 Loss: 0.07510342906274454
iter #28 Loss: 0.07455124867046276
i

In [20]:
# After training, the model holds tuned latent factors for users and products.
# A latent variable cannot be observed directly; its presence is detected
# through its effect on variables that are observable (here, the ratings).

# Show every trainable parameter of the trained model.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

# Fetch the factor matrices by name rather than by their position in
# named_parameters(), which would silently break if the order changed.
uw = model.user_factors.weight.data
iw = model.item_factors.weight.data

user_factors.weight tensor([[0.1787, 0.1898, 0.1723,  ..., 0.1976, 0.1754, 0.1653],
        [0.2602, 0.2574, 0.2483,  ..., 0.2311, 0.2198, 0.2338],
        [0.2878, 0.2623, 0.2757,  ..., 0.2540, 0.2686, 0.2834],
        ...,
        [0.1285, 0.1364, 0.1275,  ..., 0.1112, 0.1330, 0.1124],
        [0.2047, 0.2517, 0.2447,  ..., 0.2241, 0.2496, 0.2470],
        [0.0721, 0.0898, 0.0913,  ..., 0.0953, 0.1054, 0.0712]],
       device='cuda:0')
item_factors.weight tensor([[1.9619, 1.9537, 1.9768, 1.9442, 1.9722, 1.9388, 1.9521, 1.9885],
        [1.6227, 1.6262, 1.6414, 1.6565, 1.6317, 1.6540, 1.6584, 1.6623],
        [2.2944, 2.2435, 2.2793, 2.2488, 2.2682, 2.2645, 2.2637, 2.2333],
        [2.0625, 2.0695, 2.0722, 2.1038, 2.0890, 2.0969, 2.0908, 2.1062],
        [2.0603, 2.0697, 2.0903, 2.0522, 2.0743, 2.0727, 2.0519, 2.1046],
        [1.4712, 1.4734, 1.4931, 1.4934, 1.5190, 1.4736, 1.4995, 1.4697],
        [2.1033, 2.1203, 2.1160, 2.0962, 2.1151, 2.1297, 2.1023, 2.1213],
        [1.6507, 1.6

In [21]:
# Copy the learned product factors off the compute device as a NumPy array.
trained_products_embeddings = model.item_factors.weight.detach().cpu().numpy()

In [22]:
# Number of embedding rows: one factor vector per unique product.
trained_products_embeddings.shape[0]

60

In [23]:
from sklearn.cluster import KMeans
# Cluster the products by their learned factor vectors (4 clusters).
# n_init is pinned because its default changed between scikit-learn releases
# (10 -> 'auto'); without it the same random_state can give different clusters
# depending on the installed version.
kmeans = KMeans(n_clusters=4, random_state=0, n_init=10).fit(trained_products_embeddings)

In [32]:
# List the most-rated products of each cluster so the grouping can be compared
# with the product categories.
# K-means only ever saw the learned factor vectors -- never product names or
# categories -- so any grouping comes purely from how users rated the products.
#
# Four clusters were chosen for this analysis; the loop follows whatever number
# of clusters the fitted KMeans model actually has.

# Count the ratings of every product once, instead of re-scanning the whole
# ratings table for each product inside the loop.
rating_counts = ratings_df['productId'].value_counts()

for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    prods = []
    for prodidx in np.where(kmeans.labels_ == cluster)[0]:
        # Translate the embedding row index back to the raw product ID.
        prodid = train_set.idx2productid[prodidx]
        rat_count = rating_counts.get(prodid, 0)
        prods.append((products_names[prodid], rat_count))
    # Show up to 10 products per cluster, most-rated first.
    for prod in sorted(prods, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", prod[0])

Cluster #0
	 Samsung Galaxy S21
	 Xiaomi 12X
	 Samsung Galaxy S21 Ultra
	 One Plus Nord 2
	 Xiaomi Redmi Note 10
	 Samsung Galaxy S20
	 Huawei Nova 9
	 Xiaomi Redmi Note 10 Pro
	 Google Pixel 6 Pro
	 Iphone 12
Cluster #1
	 One Plus 8 Pro
	 Iphone 13 Mini
	 Iphone 13 Pro
	 Huawei 8i
	 Xiaomi Redmi Note 11
	 Samsung Galaxy S22 Plus
	 Iphone 11 Pro Max
	 Google Pixel 6a
	 Realme 9 Pro Plus
	 Realme GT Neo 2
Cluster #2
	 Iphone 11 Pro
	 Google Pixel 5a
	 Samsung Galaxy S20 Fe
	 Xiaomi Poco F3
	 Huawei P40 lite
	 Realme GT 2 Pro
	 Huawei P50 Pocket
	 One Plus 10 Pro
	 Realme Gt Neo 2
	 Huawei P30
Cluster #3
	 Xiaomi Poco M4 Pro
	 Xiaomi Redmi Note 11 Pro
	 Iphone 12 Mini
	 Huawei P50
	 Huawei P40
	 Samsung Galaxy S20 Plus
	 One Plus 8T
	 Realme 9 Pro
	 Google Pixel 5
	 Iphone 12 Pro
