# Artificial Intelligence project
#### By Chris Alexander and Viveka Salinamakki

In [1]:
from pydataset import data
import torch as pyTorch
import numpy as np
import pandas as pd

from fastai.collab import CollabDataLoaders

C:\Users\vivek\anaconda3\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
C:\Users\vivek\anaconda3\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll


In [2]:
#loading the dataset
ratings = pd.read_csv('u.data', delimiter='\t', header=None,
                      usecols=(0,1,2), names=['user','movie','rating'])
ratings.head()

Unnamed: 0,user,movie,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [3]:
movies = pd.read_csv('u.item',  delimiter='|', encoding='latin-1',
                     usecols=(0,1), names=('movie','title'), header=None)
movies.head()

Unnamed: 0,movie,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [4]:
ratings = ratings.merge(movies)
ratings.head()

Unnamed: 0,user,movie,rating,title
0,196,242,3,Kolya (1996)
1,63,242,3,Kolya (1996)
2,226,242,5,Kolya (1996)
3,154,242,3,Kolya (1996)
4,306,242,5,Kolya (1996)


In [5]:
noUniqueUsers = len(ratings['user'].unique())
noUniqueMovies = len(ratings['movie'].unique())

In [6]:
dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=64)

In [84]:
dls.show_batch(), ratings

Unnamed: 0,user,title,rating
0,868,"Davy Crockett, King of the Wild Frontier (1955)",2
1,152,Jaws (1975),4
2,312,"Misérables, Les (1995)",5
3,222,Major Payne (1994),2
4,498,"Blues Brothers, The (1980)",4
5,710,Starship Troopers (1997),3
6,92,Turbulence (1997),2
7,224,Amadeus (1984),4
8,360,Ulee's Gold (1997),3
9,286,"Parent Trap, The (1961)",3


(None,
        user  movie  rating                                               title
 0       196    242       3                                        Kolya (1996)
 1        63    242       3                                        Kolya (1996)
 2       226    242       5                                        Kolya (1996)
 3       154    242       3                                        Kolya (1996)
 4       306    242       5                                        Kolya (1996)
 ...     ...    ...     ...                                                 ...
 99995   840   1674       4                                   Mamma Roma (1962)
 99996   655   1640       3                              Eighth Day, The (1996)
 99997   655   1637       3                                   Girls Town (1996)
 99998   655   1630       3  Silence of the Palace, The (Saimt el Qusur) (1994)
 99999   655   1641       3                                     Dadetown (1995)
 
 [100000 rows x 4 columns])

In [8]:
n_factors = 50 # n_factors i.e. 1 dimension of embeddings (random)

In [9]:
min_rating,max_rating = ratings.rating.min(),ratings.rating.max()

In [10]:
def getEmbeddings(sizeEmbeddingDict, embeddingVectorSize):
    
    # Intializing the Embedding class with the size of the embedding 
    # dictionary and the size of each embedding vector
    embedding = pyTorch.nn.Embedding(sizeEmbeddingDict, embeddingVectorSize)
    
    # This initializes the weights of the embedding matrix with small 
    # random values drawn from a uniform distribution between -0.01 and 
    # 0.01, which is a common practice in deep learning to provide some 
    # initial variability to the embeddings and allow them to be 
    # learned during training.
    embedding.weight.data.uniform_(-0.01,0.01)

    return embedding

In [91]:
class RecommendationDataset(pyTorch.nn.Module):

    def __init__(self, noUsers, noMovies, hLayerCount=2, dropoutProb1=0.2,
                 dropoutProb2=0.2):
        """
        Arguments:
            noUsers (int): Number of unique users.
            noMovies (int): Number of unique movies.
            hLayerCount (int): Number of hidden layers
            dropoutProb1 (float): Dropout probability 1
            dropoutProb2 (float): Dropout probability 2
        """
        super().__init__()
        (self.u, self.m, self.ub, self.mb) = [getEmbeddings(*o) for o in [
            (noUsers, n_factors), (noMovies, n_factors),
            (noUsers,1), (noMovies,1)
        ]]
        
        self.lin1 = pyTorch.nn.Linear(n_factors, hLayerCount)  # bias is True by default
        self.lin2 = pyTorch.nn.Linear(hLayerCount, 1)
        self.drop1 = pyTorch.nn.Dropout(p = dropoutProb1)
        self.drop2 = pyTorch.nn.Dropout(p = dropoutProb2)
    
    def forward(self, catDataUsers, catDataMovies, contDataUsers=0, contDataMovies=0): # forward pass i.e.  dot product of vector from movie embedding matrix
                                    # and vector from user embeddings matrix
        
        # torch.cat : concatenates both embedding matrix to make more columns, same rows i.e. n_factors*2, n : rows
        # u(users) is doing lookup for indexed mentioned in users
        # users has indexes to lookup in embedding matrix. 
        
        u2, m2 = self.u(catDataUsers) , self.m(catDataMovies)
        concatVector = pyTorch.cat([u2,m2])
#         print(len(u2), len(m2), len(concatVector))
        concatVector = self.drop1(concatVector) # drop initialized weights
#         print(len(concatVector))
        layer1Out = self.lin1(concatVector)
        layer1Out = self.drop2(pyTorch.relu(layer1Out)) # drop 1st linear + nonlinear wt
        layer2Out = pyTorch.sigmoid(self.lin2(layer1Out)) * (max_rating - min_rating) + min_rating               
        return layer2Out

In [92]:
# wd=1e-5
# embeddingsModel = getEmbeddings(noUsers, noMovies)
# embeddingsModel = embeddingsModel.cuda()
# opt = pyTorch.optim.Adam(embeddingsModel.parameters(), 1e-3, weight_decay=wd) # got parameter() for free , lr = 1e-3
# predModel = RecommendationDataset(opt)

In [117]:
model = RecommendationDataset(noUniqueUsers, noUniqueMovies, hLayerCount=2, dropoutProb1=0.2,
                 dropoutProb2=0.2)

# load pre-trained model weights (if any)
# model.load_state_dict(torch.load('model_weights.pth'))

# prepare input data
catDataUsers = pyTorch.tensor([1,1,2,2,3,3,4,4])
catDataMovies = pyTorch.tensor([242,1,242,1674,242,1674,242,1674])
# contDataUsers = pyTorch.tensor([0.6])
# contDataMovies = pyTorch.tensor([0.8])

In [147]:
# make a prediction
prediction = model(catDataUsers, catDataMovies)

#The dot product of the embeddings corresponding to the 2 ids
print(prediction)

tensor([[2.7693],
        [2.7689],
        [2.7552],
        [2.7729],
        [2.7119],
        [2.7126],
        [2.7756],
        [2.7737],
        [2.7699],
        [2.7811],
        [2.7737],
        [2.7788],
        [2.7725],
        [2.7725],
        [2.7727],
        [2.7766]], grad_fn=<AddBackward0>)


In [148]:
# embeddingsModel.forward(catDataUsers=noUniqueUsers, catDataMovies=noUniqueMovies)

In [182]:
def retrieveRatings(df, userId, movieId):
    return df.loc[(df['user'] == userId) & (df['movie'] == movieId)]['rating'][1]

In [183]:
retrieveRatings(ratings, 63, 242)

3

In [203]:
# ratings.loc[(ratings['user'] == 63) & (ratings['movie'] == 242)]['rating'][1]

In [204]:
# define loss function
loss_function = pyTorch.nn.MSELoss()

testUserId=63
testMovieId=242

# prepare input data
catDataUsers = pyTorch.tensor([testUserId])
catDataMovies = pyTorch.tensor([testMovieId])

actualRatings = pyTorch.tensor([retrieveRatings(ratings, testUserId, testMovieId)])

# make a prediction
prediction = model(catDataUsers, catDataMovies)

# calculate the loss
loss = loss_function(prediction.squeeze(), actualRatings)

print("Loss for User ID:", testUserId,"and Movie ID:", testMovieId, " is ", loss.item())

Loss for User ID: 63 and Movie ID: 242  is  0.056301720440387726
