## AutoEncoder

*Project Description*
- Develop a model to predict a users next movie rating.


Importing Necessary Libraries

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

from torchmetrics import Accuracy
import warnings

# Set up device agnostic code
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### 1) Data Handling

- Below dataset is used to train autoencoder on 1 million user ratings.
- Use dataset for systems that are able to train on larger datasets.
- The current model is tested off a dataset containing 100k user ratings.

In [2]:
# movies = pd.read_csv('ml-1m/movies.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
# users = pd.read_csv('ml-1m/users.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
# ratings = pd.read_csv('ml-1m/ratings.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

Creating the Testing and Training sets. (100k user data)
- 100k dataset has 5 versions of Train/Test splits.
- Use first of these splits "u1" to train autoencoder.

In [3]:
training_set = pd.read_csv('ml-100k/u1.base', delimiter = '\t', header = None)
print(f"Training set:\n{training_set.head(2)}")
print("Column Keys: \n 0: Users \n 1: Movies \n 2: Ratings \n 3: Timestamps")
training_set = np.array(training_set, dtype = 'int')
test_set = pd.read_csv('ml-100k/u1.test', delimiter = '\t', header = None)
test_set = np.array(test_set, dtype = 'int')

Training set:
   0  1  2          3
0  1  1  5  874965758
1  1  2  3  876893171
Column Keys: 
 0: Users 
 1: Movies 
 2: Ratings 
 3: Timestamps


- Find the maximum value of users and movies.
- Max could be located in either the training or the test set.
- Therefore, find the max value in both training and test set, then return the max of the resut.

In [4]:
nb_users = int(max(max(training_set[:, 0], ), max(test_set[:, 0])))
nb_movies = int(max(max(training_set[:, 1], ), max(test_set[:, 1])))
print(f"Number of Users : {nb_users}")
print(f"Number of Movies: {nb_movies}")

mins = int(min(min(training_set[:, 1], ), min(test_set[:, 1])))

Number of Users : 943
Number of Movies: 1682


In [5]:
def convertData(data:np.array):
    """ 
        Convert training data into 2D matrix where row indicies equal users and
        columns equal movie id. Cell reffers to user rating.
        
        Fill 0 for user who did not rate a movie.
        
        Information from training data:
        Column 1: User -> min user = 1, max user = 943
        Column 2: Movie -> min id = 1, max id = 1682
        Column 3: Rating -> min rating = 0, max rating = 5
        Column 4: Timestamp -> not used for training
    
    """
    convert_data = []
    for user in range(1, nb_users + 1):

        # Preallocate array with len of total movies in database
        zeros = [0] * (nb_movies + 1)

        # Gather all reviews a user has made
        single_user_reviews = data[data[:, 0] == user]

        # Fill loc zero with user rating that is in same column as movie id.
        for rating in single_user_reviews:
            zeros[rating[1]] = rating[2]
        
        # Remove first element in zeros because that column loc indicates a movie id of 0.
        # In database there does not exist a movie is with 0. (min id = 1)
        zeros.pop(0)
        convert_data.append(zeros)
    
    return convert_data # (943, 1682) based of u1 dataset

training_set = convertData(training_set)
test_set = convertData(test_set)

Notes about converted datasets:
- A users row will contain many rattings with 0. This is because a user is not expected to review all moves in datasets.

In [6]:
warnings.filterwarnings('ignore')
training_set = torch.tensor(training_set, dtype=torch.float, device=device)
test_set = torch.tensor(training_set, dtype=torch.float, device=device)
warnings.filterwarnings('default')

### 2) Develop Model

In [7]:
class SAE(nn.Module):
    def __init__(self, ):
        super(SAE, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(nb_movies, 20),
            nn.Linear(20, 10),
            nn.Linear(10, 20),
            nn.Linear(20, nb_movies)
        )
        self.activation = nn.Sigmoid()
    def forward(self, x):   
       return self.activation(self.layers(x))
   
sae = SAE().to(device)
loss_fn = nn.L1Loss()
optimizer = optim.Adam(sae.parameters(), lr = 0.001, weight_decay= 0.50)  

### 3) Train Model

In [8]:
nb_epoch = 200

for epoch in range(1, nb_epoch + 1):
    
    sae.train()
    train_loss = 0

    # Count number of users that rated 1 movie
    # Do not count users who did not rate any movie for memory optimization
    num_users_who_rate_at_least_1_movie = 0. # Will use to compute RMS error

    for id_user in range(nb_users):
        input_x = training_set[id_user].unsqueeze(0).requires_grad_()
        target = input_x.clone().detach() # removes gradient tracking from target

        # Only train on data where user rated at least one movie.
        if torch.sum(target.data > 0) > 0:
            
            output = sae(input_x)
           
            """ 
                Below is necessary for optimization and model performance.

                If a user did not rate a movie and the model predicts their rating to be high,
                the model will calulate this RMS error and will adjust inappropriately:

                ex/ User did not rate movie = 0 
                    Model predicts = 3

                    This result will carry over to loss and assume a loss of 3. 
                    Model loss will theerefore have a difficult time decreasing.
            """
            # output[target == 0] = 0
            output_2 = torch.where(target == 0, 0., output).requires_grad_()
           
            loss = loss_fn(output_2, target)
            # if id_user % 100 == 0:
            #     print(f"Sum of target: {torch.sum(target.data > 0)}  -  Difference: {torch.sum(torch.subtract(target.data, output_2) > 0)}")
            mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
            
            loss.backward()
            train_loss += np.sqrt(loss.item() * mean_corrector)
            num_users_who_rate_at_least_1_movie += 1.

            optimizer.step()

    if epoch % 20 == 0:
        print('epoch: ' + str(epoch) + ' loss: '+ str(train_loss/num_users_who_rate_at_least_1_movie))

epoch: 20 loss: 1.6024633166778433
epoch: 40 loss: 1.6023816835638196
epoch: 60 loss: 1.6023639545319674
epoch: 80 loss: 1.6023568041301266
epoch: 100 loss: 1.6023530621869746
epoch: 120 loss: 1.6023508014553458
epoch: 140 loss: 1.602349296990143
epoch: 160 loss: 1.6023482383991303
epoch: 180 loss: 1.6023474541859781
epoch: 200 loss: 1.6023468486579169


### 4) Test Model

In [9]:
with torch.inference_mode():
    sae.eval()
    test_loss = 0
    s = 0.
    for id_user in range(nb_users):
        input_x = test_set[id_user].unsqueeze(0).requires_grad_()
        target = input_x.clone()
        if torch.sum(target.data > 0) > 0:
            output = sae(input_x)
            target.require_grad = False
            output[target == 0] = 0
            loss = loss_fn(output, target)
            mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
            test_loss += np.sqrt(loss.item()*mean_corrector)
            s += 1.

print('test loss: '+ str(test_loss/s))

test loss: 1.6023468302387454


In [10]:
with torch.no_grad():
  print(sae(training_set[0]))

tensor([1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000], device='cuda:0')


Model takes a while to train on most systems.
Save / Load model state after training in case errors occur.

In [11]:
# torch.save(sae, r'/model_sae.pt')