# Artificial Intelligence project
#### By Chris Alexander and Viveka Salinamakki

In [1]:
from pydataset import data
import torch as pyTorch
import numpy as np
import pandas as pd

from fastai.collab import CollabDataLoaders

C:\Users\vivek\anaconda3\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
C:\Users\vivek\anaconda3\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll


In [2]:
#loading the dataset
ratings = pd.read_csv('u.data', delimiter='\t', header=None,
                      usecols=(0,1,2), names=['user','movie','rating'])
ratings.head()

Unnamed: 0,user,movie,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [3]:
movies = pd.read_csv('u.item',  delimiter='|', encoding='latin-1',
                     usecols=(0,1), names=('movie','title'), header=None)
movies.head()

Unnamed: 0,movie,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [4]:
ratings = ratings.merge(movies)
ratings.head()

Unnamed: 0,user,movie,rating,title
0,196,242,3,Kolya (1996)
1,63,242,3,Kolya (1996)
2,226,242,5,Kolya (1996)
3,154,242,3,Kolya (1996)
4,306,242,5,Kolya (1996)


In [5]:
noUniqueUsers = len(ratings['user'].unique())
noUniqueMovies = len(ratings['movie'].unique())

In [6]:
dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=64)

In [7]:
dls.show_batch(), ratings

Unnamed: 0,user,title,rating
0,878,Brazil (1985),2
1,343,Stand by Me (1986),5
2,395,Apt Pupil (1998),5
3,345,In & Out (1997),4
4,58,C'est arrivé près de chez vous (1992),2
5,797,Gone Fishin' (1997),2
6,936,Romy and Michele's High School Reunion (1997),3
7,666,Full Metal Jacket (1987),5
8,621,What's Eating Gilbert Grape (1993),3
9,33,Conspiracy Theory (1997),4


(None,
        user  movie  rating                                               title
 0       196    242       3                                        Kolya (1996)
 1        63    242       3                                        Kolya (1996)
 2       226    242       5                                        Kolya (1996)
 3       154    242       3                                        Kolya (1996)
 4       306    242       5                                        Kolya (1996)
 ...     ...    ...     ...                                                 ...
 99995   840   1674       4                                   Mamma Roma (1962)
 99996   655   1640       3                              Eighth Day, The (1996)
 99997   655   1637       3                                   Girls Town (1996)
 99998   655   1630       3  Silence of the Palace, The (Saimt el Qusur) (1994)
 99999   655   1641       3                                     Dadetown (1995)
 
 [100000 rows x 4 columns])

In [8]:
ratings_df=ratings.drop('movie', axis=1)

In [9]:
ratings_df

Unnamed: 0,user,rating,title
0,196,3,Kolya (1996)
1,63,3,Kolya (1996)
2,226,5,Kolya (1996)
3,154,3,Kolya (1996)
4,306,5,Kolya (1996)
...,...,...,...
99995,840,4,Mamma Roma (1962)
99996,655,3,"Eighth Day, The (1996)"
99997,655,3,Girls Town (1996)
99998,655,3,"Silence of the Palace, The (Saimt el Qusur) (1994)"


In [10]:
n_factors = 50 # n_factors i.e. 1 dimension of embeddings (random)

In [11]:
min_rating,max_rating = ratings.rating.min(),ratings.rating.max()

In [12]:
def getEmbeddings(sizeEmbeddingDict, embeddingVectorSize):
    
    # Intializing the Embedding class with the size of the embedding 
    # dictionary and the size of each embedding vector
    embedding = pyTorch.nn.Embedding(sizeEmbeddingDict, embeddingVectorSize)
    
    # This initializes the weights of the embedding matrix with small 
    # random values drawn from a uniform distribution between -0.01 and 
    # 0.01, which is a common practice in deep learning to provide some 
    # initial variability to the embeddings and allow them to be 
    # learned during training.
    embedding.weight.data.uniform_(-0.01,0.01)

    return embedding

In [13]:
class RecommendationDataset(pyTorch.nn.Module):

    def __init__(self, noUsers, noMovies, hLayerCount=2, dropoutProb1=0.2,
                 dropoutProb2=0.2):
        """
        Arguments:
            noUsers (int): Number of unique users.
            noMovies (int): Number of unique movies.
            hLayerCount (int): Number of hidden layers
            dropoutProb1 (float): Dropout probability 1
            dropoutProb2 (float): Dropout probability 2
        """
        super().__init__()
        (self.u, self.m, self.ub, self.mb) = [getEmbeddings(*o) for o in [
            (noUsers, n_factors), (noMovies, n_factors),
            (noUsers,1), (noMovies,1)
        ]]
        
        self.lin1 = pyTorch.nn.Linear(n_factors, hLayerCount)  # bias is True by default
        self.lin2 = pyTorch.nn.Linear(hLayerCount, 1)
        self.drop1 = pyTorch.nn.Dropout(p = dropoutProb1)
        self.drop2 = pyTorch.nn.Dropout(p = dropoutProb2)
    
    def forward(self, catDataUsers, catDataMovies, contDataUsers=0, contDataMovies=0): # forward pass i.e.  dot product of vector from movie embedding matrix
                                    # and vector from user embeddings matrix
        
        # pyTorch.cat : concatenates both embedding matrix to make more columns, same rows i.e. n_factors*2, n : rows
        # u(users) is doing lookup for indexed mentioned in users
        # users has indexes to lookup in embedding matrix. 
        
        u2, m2 = self.u(catDataUsers) , self.m(catDataMovies)
        concatVector = pyTorch.cat([u2,m2])
#         print(len(u2), len(m2), len(concatVector))
        concatVector = self.drop1(concatVector) # drop initialized weights
#         print(len(concatVector))
        layer1Out = self.lin1(concatVector)
        layer1Out = self.drop2(pyTorch.relu(layer1Out)) # drop 1st linear + nonlinear wt
        layer2Out = pyTorch.sigmoid(self.lin2(layer1Out)) * (max_rating - min_rating) + min_rating               
        return layer2Out

In [14]:
model = RecommendationDataset(noUniqueUsers, noUniqueMovies, hLayerCount=2, dropoutProb1=0.2,
                 dropoutProb2=0.2)

# prepare input data
catDataUsers_ip = pyTorch.tensor([1,1,2,2,3,3,4,4])
catDataMovies_ip = pyTorch.tensor([242,1,242,1674,242,1674,242,1674])

In [15]:
print(model)

RecommendationDataset(
  (u): Embedding(943, 50)
  (m): Embedding(1682, 50)
  (ub): Embedding(943, 1)
  (mb): Embedding(1682, 1)
  (lin1): Linear(in_features=50, out_features=2, bias=True)
  (lin2): Linear(in_features=2, out_features=1, bias=True)
  (drop1): Dropout(p=0.2, inplace=False)
  (drop2): Dropout(p=0.2, inplace=False)
)


In [16]:
# make a prediction
prediction = model(catDataUsers_ip, catDataMovies_ip)

#The dot product of the embeddings corresponding to the 2 ids
print("The prediction for the users", catDataUsers_ip," and movies ", catDataMovies_ip," are ",prediction)

The prediction for the users tensor([1, 1, 2, 2, 3, 3, 4, 4])  and movies  tensor([ 242,    1,  242, 1674,  242, 1674,  242, 1674])  are  tensor([[2.6619],
        [2.6619],
        [2.6619],
        [2.6619],
        [2.6619],
        [2.6619],
        [2.6619],
        [2.6619],
        [2.6619],
        [2.6619],
        [2.6619],
        [2.6619],
        [2.6619],
        [2.6619],
        [2.6619],
        [2.6619]], grad_fn=<AddBackward0>)


In [17]:
def retrieveRatings(df, userId, movieId):
    return df.loc[(df['user'] == userId) & (df['movie'] == movieId)]['rating'][1]

In [18]:
def lossCalculation(testUserId, testMovieId):
    # define loss function
    loss_function = pyTorch.nn.MSELoss()

    # prepare input data
    catDataUsers = pyTorch.tensor([testUserId])
    catDataMovies = pyTorch.tensor([testMovieId])

    actualRatings = pyTorch.tensor([retrieveRatings(ratings, testUserId, testMovieId)])

    # make a prediction
    prediction = model(catDataUsers, catDataMovies)

    # calculate the loss
    loss = loss_function(prediction.squeeze(), actualRatings)

    print("Loss for User ID:", testUserId,"and Movie ID:", testMovieId, " is ", loss.item())

In [19]:
all_users = ratings['user'].unique()
all_movies = ratings['movie'].unique()

In [20]:
def get_user_ratings(user):
    user_ratings = ratings[ratings['user'] == user]['rating'].tolist()
    movie_ratings = [0] * len(all_movies)
    for i, movie in enumerate(all_movies):
        if movie in user_ratings:
            movie_ratings[i] = user_ratings.count(movie)
    return movie_ratings

In [21]:
u_ww = model.u.weight[1:].cpu().detach().numpy()
i_ww = model.m.weight[1:].cpu().detach().numpy().transpose()


# Tensor of weights for u_weight
# u_ww = model.embeds[0].weight[1:].cpu().detach().numpy()
u_ww.shape

# Tensor of weights for i_weight
# i_ww = (model.embeds[1].weight[1:].cpu().detach().numpy()).transpose()
i_ww.shape

# Dot product similarity between all users and items
array = np.dot(u_ww, i_ww)



# Matrix with rating output
array = np.zeros((len(all_users), len(all_movies)))
for i, user in enumerate(all_users):
    array[i,:] = get_user_ratings(user)

matrix = pd.DataFrame(array, index = all_users.tolist()) # convert to list

# Rank top 50 venues per user
top_50 = {}
for user in all_users:
    sorted_ratings = matrix.loc[user].sort_values(ascending=False)
    top_50[user] = sorted_ratings.index[:50].tolist()





# Matrix with rating output
matrix = pd.DataFrame(array, index = all_users)

# Rank top 5 venues per user
all_rec_df = pd.DataFrame(matrix.apply(lambda s, n: pd.Series(s.nlargest(n).index), axis=1, n=5))

In [22]:
all_rec_df

Unnamed: 0,0,1,2,3,4
196,47,233,75,147,24
63,233,47,147,75,24
226,47,233,75,147,24
154,47,75,233,147,0
306,47,75,233,147,24
...,...,...,...,...,...
799,47,75,233,147,0
358,47,75,233,147,24
410,233,47,147,24,75
598,47,75,233,147,0


In [23]:
all_rec_df_title = all_rec_df.replace(dict(zip(movies['movie'], movies['title'])))

In [24]:
all_rec_df_title

Unnamed: 0,0,1,2,3,4
196,Ed Wood (1994),Under Siege (1992),Brother Minister: The Assassination of Malcolm X (1994),"Long Kiss Goodnight, The (1996)",Rumble in the Bronx (1995)
63,Under Siege (1992),Ed Wood (1994),"Long Kiss Goodnight, The (1996)",Brother Minister: The Assassination of Malcolm X (1994),Rumble in the Bronx (1995)
226,Ed Wood (1994),Under Siege (1992),Brother Minister: The Assassination of Malcolm X (1994),"Long Kiss Goodnight, The (1996)",Rumble in the Bronx (1995)
154,Ed Wood (1994),Brother Minister: The Assassination of Malcolm X (1994),Under Siege (1992),"Long Kiss Goodnight, The (1996)",0
306,Ed Wood (1994),Brother Minister: The Assassination of Malcolm X (1994),Under Siege (1992),"Long Kiss Goodnight, The (1996)",Rumble in the Bronx (1995)
...,...,...,...,...,...
799,Ed Wood (1994),Brother Minister: The Assassination of Malcolm X (1994),Under Siege (1992),"Long Kiss Goodnight, The (1996)",0
358,Ed Wood (1994),Brother Minister: The Assassination of Malcolm X (1994),Under Siege (1992),"Long Kiss Goodnight, The (1996)",Rumble in the Bronx (1995)
410,Under Siege (1992),Ed Wood (1994),"Long Kiss Goodnight, The (1996)",Rumble in the Bronx (1995),Brother Minister: The Assassination of Malcolm X (1994)
598,Ed Wood (1994),Brother Minister: The Assassination of Malcolm X (1994),Under Siege (1992),"Long Kiss Goodnight, The (1996)",0
