# Artificial Intelligence project
#### By Chris Alexander and Viveka Salinamakki

In [1]:
from pydataset import data
import torch as pyTorch
import numpy as np
import pandas as pd

from fastai.collab import CollabDataLoaders

C:\Users\vivek\anaconda3\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
C:\Users\vivek\anaconda3\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll


In [2]:
#loading the dataset
ratings = pd.read_csv('u.data', delimiter='\t', header=None,
                      usecols=(0,1,2), names=['user','movie','rating'])
ratings.head()

Unnamed: 0,user,movie,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [3]:
movies = pd.read_csv('u.item',  delimiter='|', encoding='latin-1',
                     usecols=(0,1), names=('movie','title'), header=None)
movies.head()

Unnamed: 0,movie,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [4]:
ratings = ratings.merge(movies)
ratings.head()

Unnamed: 0,user,movie,rating,title
0,196,242,3,Kolya (1996)
1,63,242,3,Kolya (1996)
2,226,242,5,Kolya (1996)
3,154,242,3,Kolya (1996)
4,306,242,5,Kolya (1996)


In [5]:
noUniqueUsers = len(ratings['user'].unique())
noUniqueMovies = len(ratings['movie'].unique())

In [6]:
dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=64)

In [7]:
dls.show_batch(), ratings

Unnamed: 0,user,title,rating
0,737,"Shawshank Redemption, The (1994)",4
1,234,Nikita (La Femme Nikita) (1990),3
2,823,Aliens (1986),4
3,416,"Smile Like Yours, A (1997)",3
4,302,Kull the Conqueror (1997),2
5,576,Phenomenon (1996),4
6,243,Sirens (1994),3
7,662,Star Wars (1977),3
8,385,Rosencrantz and Guildenstern Are Dead (1990),5
9,764,Tin Cup (1996),4


(None,
        user  movie  rating                                               title
 0       196    242       3                                        Kolya (1996)
 1        63    242       3                                        Kolya (1996)
 2       226    242       5                                        Kolya (1996)
 3       154    242       3                                        Kolya (1996)
 4       306    242       5                                        Kolya (1996)
 ...     ...    ...     ...                                                 ...
 99995   840   1674       4                                   Mamma Roma (1962)
 99996   655   1640       3                              Eighth Day, The (1996)
 99997   655   1637       3                                   Girls Town (1996)
 99998   655   1630       3  Silence of the Palace, The (Saimt el Qusur) (1994)
 99999   655   1641       3                                     Dadetown (1995)
 
 [100000 rows x 4 columns])

In [61]:
ratings_df=ratings.drop('movie', axis=1)

In [62]:
ratings_df

Unnamed: 0,user,rating,title
0,196,3,Kolya (1996)
1,63,3,Kolya (1996)
2,226,5,Kolya (1996)
3,154,3,Kolya (1996)
4,306,5,Kolya (1996)
...,...,...,...
99995,840,4,Mamma Roma (1962)
99996,655,3,"Eighth Day, The (1996)"
99997,655,3,Girls Town (1996)
99998,655,3,"Silence of the Palace, The (Saimt el Qusur) (1994)"


In [8]:
n_factors = 50 # n_factors i.e. 1 dimension of embeddings (random)

In [9]:
min_rating,max_rating = ratings.rating.min(),ratings.rating.max()

In [10]:
def getEmbeddings(sizeEmbeddingDict, embeddingVectorSize):
    
    # Intializing the Embedding class with the size of the embedding 
    # dictionary and the size of each embedding vector
    embedding = pyTorch.nn.Embedding(sizeEmbeddingDict, embeddingVectorSize)
    
    # This initializes the weights of the embedding matrix with small 
    # random values drawn from a uniform distribution between -0.01 and 
    # 0.01, which is a common practice in deep learning to provide some 
    # initial variability to the embeddings and allow them to be 
    # learned during training.
    embedding.weight.data.uniform_(-0.01,0.01)

    return embedding

In [11]:
class RecommendationDataset(pyTorch.nn.Module):

    def __init__(self, noUsers, noMovies, hLayerCount=2, dropoutProb1=0.2,
                 dropoutProb2=0.2):
        """
        Arguments:
            noUsers (int): Number of unique users.
            noMovies (int): Number of unique movies.
            hLayerCount (int): Number of hidden layers
            dropoutProb1 (float): Dropout probability 1
            dropoutProb2 (float): Dropout probability 2
        """
        super().__init__()
        (self.u, self.m, self.ub, self.mb) = [getEmbeddings(*o) for o in [
            (noUsers, n_factors), (noMovies, n_factors),
            (noUsers,1), (noMovies,1)
        ]]
        
        self.lin1 = pyTorch.nn.Linear(n_factors, hLayerCount)  # bias is True by default
        self.lin2 = pyTorch.nn.Linear(hLayerCount, 1)
        self.drop1 = pyTorch.nn.Dropout(p = dropoutProb1)
        self.drop2 = pyTorch.nn.Dropout(p = dropoutProb2)
    
    def forward(self, catDataUsers, catDataMovies, contDataUsers=0, contDataMovies=0): # forward pass i.e.  dot product of vector from movie embedding matrix
                                    # and vector from user embeddings matrix
        
        # pyTorch.cat : concatenates both embedding matrix to make more columns, same rows i.e. n_factors*2, n : rows
        # u(users) is doing lookup for indexed mentioned in users
        # users has indexes to lookup in embedding matrix. 
        
        u2, m2 = self.u(catDataUsers) , self.m(catDataMovies)
        concatVector = pyTorch.cat([u2,m2])
#         print(len(u2), len(m2), len(concatVector))
        concatVector = self.drop1(concatVector) # drop initialized weights
#         print(len(concatVector))
        layer1Out = self.lin1(concatVector)
        layer1Out = self.drop2(pyTorch.relu(layer1Out)) # drop 1st linear + nonlinear wt
        layer2Out = pyTorch.sigmoid(self.lin2(layer1Out)) * (max_rating - min_rating) + min_rating               
        return layer2Out

In [12]:
model = RecommendationDataset(noUniqueUsers, noUniqueMovies, hLayerCount=2, dropoutProb1=0.2,
                 dropoutProb2=0.2)

# prepare input data
catDataUsers_ip = pyTorch.tensor([1,1,2,2,3,3,4,4])
catDataMovies_ip = pyTorch.tensor([242,1,242,1674,242,1674,242,1674])

In [13]:
print(model)

RecommendationDataset(
  (u): Embedding(943, 50)
  (m): Embedding(1682, 50)
  (ub): Embedding(943, 1)
  (mb): Embedding(1682, 1)
  (lin1): Linear(in_features=50, out_features=2, bias=True)
  (lin2): Linear(in_features=2, out_features=1, bias=True)
  (drop1): Dropout(p=0.2, inplace=False)
  (drop2): Dropout(p=0.2, inplace=False)
)


In [103]:
# make a prediction
prediction = model(catDataUsers_ip, catDataMovies_ip)

#The dot product of the embeddings corresponding to the 2 ids
print("The prediction for the users", catDataUsers_ip," and movies ", catDataMovies_ip," are ",prediction)

The prediction for the users tensor([1, 1, 2, 2, 3, 3, 4, 4])  and movies  tensor([ 242,    1,  242, 1674,  242, 1674,  242, 1674])  are  tensor([[3.1216],
        [3.1214],
        [3.0915],
        [3.1183],
        [3.1178],
        [3.1167],
        [3.1192],
        [3.0901],
        [3.1181],
        [3.1193],
        [3.1188],
        [3.0905],
        [3.1075],
        [3.1175],
        [3.0816],
        [3.1176]], grad_fn=<AddBackward0>)


In [15]:
def retrieveRatings(df, userId, movieId):
    return df.loc[(df['user'] == userId) & (df['movie'] == movieId)]['rating'][1]

In [89]:
retrieveRatings(ratings, 196, 242)

KeyError: 1

In [17]:
def lossCalculation(testUserId, testMovieId):
    # define loss function
    loss_function = pyTorch.nn.MSELoss()

    # prepare input data
    catDataUsers = pyTorch.tensor([testUserId])
    catDataMovies = pyTorch.tensor([testMovieId])

    actualRatings = pyTorch.tensor([retrieveRatings(ratings, testUserId, testMovieId)])

    # make a prediction
    prediction = model(catDataUsers, catDataMovies)

    # calculate the loss
    loss = loss_function(prediction.squeeze(), actualRatings)

    print("Loss for User ID:", testUserId,"and Movie ID:", testMovieId, " is ", loss.item())

In [101]:
np.unique(ratings.loc[:,['user','movie']].values.tolist()), ratings.loc[:,['user','movie']].values.tolist()

(array([   1,    2,    3, ..., 1680, 1681, 1682]),
 [[196, 242],
  [63, 242],
  [226, 242],
  [154, 242],
  [306, 242],
  [296, 242],
  [34, 242],
  [271, 242],
  [201, 242],
  [209, 242],
  [35, 242],
  [354, 242],
  [199, 242],
  [113, 242],
  [1, 242],
  [173, 242],
  [360, 242],
  [234, 242],
  [14, 242],
  [309, 242],
  [331, 242],
  [21, 242],
  [111, 242],
  [439, 242],
  [355, 242],
  [204, 242],
  [145, 242],
  [30, 242],
  [463, 242],
  [144, 242],
  [417, 242],
  [2, 242],
  [497, 242],
  [523, 242],
  [12, 242],
  [202, 242],
  [131, 242],
  [451, 242],
  [532, 242],
  [539, 242],
  [537, 242],
  [416, 242],
  [566, 242],
  [597, 242],
  [181, 242],
  [639, 242],
  [123, 242],
  [520, 242],
  [617, 242],
  [86, 242],
  [624, 242],
  [9, 242],
  [651, 242],
  [492, 242],
  [207, 242],
  [460, 242],
  [697, 242],
  [129, 242],
  [40, 242],
  [740, 242],
  [239, 242],
  [794, 242],
  [461, 242],
  [771, 242],
  [733, 242],
  [568, 242],
  [673, 242],
  [249, 242],
  [6, 242],


In [96]:
def MSE_all(df):
    MSE_avg=0
    for unique_users,unique_movies in np.unique(df.loc[:,['user','movie']].values.tolist()):
        #for unique_movies in df['movie'].unique():
        print(unique_users, unique_movies, retrieveRatings(ratings, unique_users, unique_movies))
        MSE_avg+=lossCalculation(testUserId=unique_users, testMovieId=unique_movies)
        
    return MSE_avg/len(MSE_avg)

In [97]:
MSE_all(ratings)

TypeError: cannot unpack non-iterable numpy.int32 object

In [74]:
optimizer = pyTorch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [75]:
def train_one_epoch(epoch_index, tb_writer):
    running_loss = 0.
    last_loss = 0.

    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in enumerate(ratings_df.values.tolist()):
        # Every data instance is an input + label pair
        inputs1, inputs2, labels = data

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs1, inputs2)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        if i % 1000 == 999:
            last_loss = running_loss / 1000 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(ratings_df.values.tolist()) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

    return last_loss

In [76]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter()

train_one_epoch(0, writer)

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not int

In [71]:
ratings_df.values.tolist()

[[196, 3, 'Kolya (1996)'],
 [63, 3, 'Kolya (1996)'],
 [226, 5, 'Kolya (1996)'],
 [154, 3, 'Kolya (1996)'],
 [306, 5, 'Kolya (1996)'],
 [296, 4, 'Kolya (1996)'],
 [34, 5, 'Kolya (1996)'],
 [271, 4, 'Kolya (1996)'],
 [201, 4, 'Kolya (1996)'],
 [209, 4, 'Kolya (1996)'],
 [35, 2, 'Kolya (1996)'],
 [354, 5, 'Kolya (1996)'],
 [199, 5, 'Kolya (1996)'],
 [113, 2, 'Kolya (1996)'],
 [1, 5, 'Kolya (1996)'],
 [173, 5, 'Kolya (1996)'],
 [360, 4, 'Kolya (1996)'],
 [234, 4, 'Kolya (1996)'],
 [14, 4, 'Kolya (1996)'],
 [309, 4, 'Kolya (1996)'],
 [331, 4, 'Kolya (1996)'],
 [21, 3, 'Kolya (1996)'],
 [111, 4, 'Kolya (1996)'],
 [439, 5, 'Kolya (1996)'],
 [355, 4, 'Kolya (1996)'],
 [204, 5, 'Kolya (1996)'],
 [145, 5, 'Kolya (1996)'],
 [30, 5, 'Kolya (1996)'],
 [463, 2, 'Kolya (1996)'],
 [144, 4, 'Kolya (1996)'],
 [417, 3, 'Kolya (1996)'],
 [2, 5, 'Kolya (1996)'],
 [497, 1, 'Kolya (1996)'],
 [523, 5, 'Kolya (1996)'],
 [12, 5, 'Kolya (1996)'],
 [202, 3, 'Kolya (1996)'],
 [131, 5, 'Kolya (1996)'],
 [451, 1, 'K

In [51]:
df=pd.DataFrame(dls.show_batch())
df

Unnamed: 0,user,title,rating
0,405,Three Colors: Red (1994),1
1,125,"Sting, The (1973)",5
2,187,Field of Dreams (1989),3
3,565,Wings of Desire (1987),3
4,234,Like Water For Chocolate (Como agua para chocolate) (1992),4
5,447,Dead Man Walking (1995),2
6,907,If Lucy Fell (1996),4
7,493,In the Company of Men (1997),3
8,95,"Mark of Zorro, The (1940)",4
9,452,"Bridges of Madison County, The (1995)",3


In [113]:
import torch
import torch.nn as nn
import torch.optim as optim

# Set up the data
noUniqueUsers = max(ratings['user']) + 1
noUniqueMovies = max(ratings['movie']) + 1
X = torch.tensor(ratings[['user', 'movie']].values)
y = torch.tensor(ratings['rating'].values, dtype=torch.float32)

# Split the data into train and test sets
train_size = int(0.8 * len(X))
train_indices = torch.randperm(len(X))[:train_size]
test_indices = torch.arange(len(X))[~train_indices]
X_train, y_train = X[train_indices], y[train_indices]
X_test, y_test = X[test_indices], y[test_indices]

# Define the model and optimizer
model = RecommendationDataset(noUniqueUsers, noUniqueMovies, hLayerCount=2, dropoutProb1=0.2,
                              dropoutProb2=0.2)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Train the model
batch_size = 64
num_epochs = 10
# train_losses, test_losses = [], []
for epoch in range(num_epochs):
    model.train()
    for i in range(0, len(X_train), batch_size):
        optimizer.zero_grad()
        batch_X, batch_y = X_train[i:i+batch_size], y_train[i:i+batch_size]
        preds = model(*batch_X.T)
        #loss = nn.MSELoss()(preds, batch_y)
        #loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
    model.eval()
    with torch.no_grad():
        preds = model(*X_test.T)
        #loss = nn.MSELoss()(preds, y_test)
        #test_losses.append(loss.item())
#     print(f"Epoch {epoch+1}, train loss: {train_losses[-1]:.4f}, test loss: {test_losses[-1]:.4f}")

# Make predictions for all users and movies
model.eval()
with torch.no_grad():
    all_users = torch.arange(noUniqueUsers)
    all_movies = torch.arange(noUniqueMovies)
    all_pairs = torch.cartesian_prod(all_users, all_movies)
    all_preds = model(*all_pairs.T)

# Extract the predictions for a specific user
user_id = 1
user_movies = torch.arange(noUniqueMovies)[X[:, 0] == user_id]
user_preds = all_preds[X[:, 0] == user_id]

# Get the top recommended movies for the user
_, top_movie_indices = torch.topk(user_preds, k=10)
top_movie_ids = user_movies[top_movie_indices]

print(f"Top 10 recommended movies for user {user_id}:")
print(movies[movies.movieId.isin(top_movie_ids)][['movieId', 'title']])


IndexError: The shape of the mask [100000] at index 0 does not match the shape of the indexed tensor [1683] at index 0

In [108]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the model and optimizer
model = RecommendationDataset(noUniqueUsers, noUniqueMovies, hLayerCount=2, dropoutProb1=0.2,
                              dropoutProb2=0.2)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Train the model
batch_size = 64
num_epochs = 10
train_losses, test_losses = [], []

X_train = np.array(ratings.loc[:,['user', 'title']])
y_train = np.array(ratings.loc[:,'rating'])

for epoch in range(num_epochs):
    model.train()
    for i in range(0, len(X_train), batch_size):
        optimizer.zero_grad()
        batch_X, batch_y = X_train[i:i+batch_size], y_train[i:i+batch_size]
        preds = model(*batch_X.T)
        loss = nn.MSELoss()(preds, batch_y)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
    model.eval()
    with torch.no_grad():
        preds = model(*X_test.T)
        loss = nn.MSELoss()(preds, y_test)
        test_losses.append(loss.item())
    print(f"Epoch {epoch+1}, train loss: {train_losses[-1]:.4f}, test loss: {test_losses[-1]:.4f}")

# Make predictions for all users and movies
model.eval()
with torch.no_grad():
    all_users = torch.arange(noUniqueUsers)
    all_movies = torch.arange(noUniqueMovies)
    all_pairs = torch.cartesian_prod(all_users, all_movies)
    all_preds = model(*all_pairs.T)

# Extract the predictions for a specific user
user_id = 1
user_movies = torch.arange(noUniqueMovies)[X[:, 0] == user_id]
user_preds = all_preds[X[:, 0] == user_id]

# Get the top recommended movies for the user
_, top_movie_indices = torch.topk(user_preds, k=10)
top_movie_ids = user_movies[top_movie_indices]

print(f"Top 10 recommended movies for user {user_id}:")
print(movies[movies.movieId.isin(top_movie_ids)][['movieId', 'title']])

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not numpy.ndarray

In [138]:

all_users = ratings['user'].unique()
all_movies = ratings['movie'].unique()

In [139]:
def get_user_ratings(user):
    user_ratings = ratings[ratings['user'] == user]['rating'].tolist()
    movie_ratings = [0] * len(all_movies)
    for i, movie in enumerate(all_movies):
        if movie in user_ratings:
            movie_ratings[i] = user_ratings.count(movie)
    return movie_ratings

In [140]:
u_ww = model.u.weight[1:].cpu().detach().numpy()
i_ww = model.m.weight[1:].cpu().detach().numpy().transpose()


# Tensor of weights for u_weight
# u_ww = model.embeds[0].weight[1:].cpu().detach().numpy()
u_ww.shape

# Tensor of weights for i_weight
# i_ww = (model.embeds[1].weight[1:].cpu().detach().numpy()).transpose()
i_ww.shape

# Dot product similarity between all users and items
array = np.dot(u_ww, i_ww)



# Matrix with rating output
array = np.zeros((len(all_users), len(all_movies)))
for i, user in enumerate(all_users):
    array[i,:] = get_user_ratings(user)

matrix = pd.DataFrame(array, index = all_users.tolist()) # convert to list

# Rank top 50 venues per user
top_50 = {}
for user in all_users:
    sorted_ratings = matrix.loc[user].sort_values(ascending=False)
    top_50[user] = sorted_ratings.index[:50].tolist()





# Matrix with rating output
matrix = pd.DataFrame(array, index = all_users)

# Rank top 5 venues per user
all_rec_df = pd.DataFrame(matrix.apply(lambda s, n: pd.Series(s.nlargest(n).index), axis=1, n=5))

In [141]:
all_rec_df

Unnamed: 0,0,1,2,3,4
196,47,233,75,147,24
63,233,47,147,75,24
226,47,233,75,147,24
154,47,75,233,147,0
306,47,75,233,147,24
...,...,...,...,...,...
799,47,75,233,147,0
358,47,75,233,147,24
410,233,47,147,24,75
598,47,75,233,147,0


In [142]:
all_rec_df_title = all_rec_df.replace(dict(zip(movies['movie'], movies['title'])))

In [143]:
all_rec_df_title

Unnamed: 0,0,1,2,3,4
196,Ed Wood (1994),Under Siege (1992),Brother Minister: The Assassination of Malcolm X (1994),"Long Kiss Goodnight, The (1996)",Rumble in the Bronx (1995)
63,Under Siege (1992),Ed Wood (1994),"Long Kiss Goodnight, The (1996)",Brother Minister: The Assassination of Malcolm X (1994),Rumble in the Bronx (1995)
226,Ed Wood (1994),Under Siege (1992),Brother Minister: The Assassination of Malcolm X (1994),"Long Kiss Goodnight, The (1996)",Rumble in the Bronx (1995)
154,Ed Wood (1994),Brother Minister: The Assassination of Malcolm X (1994),Under Siege (1992),"Long Kiss Goodnight, The (1996)",0
306,Ed Wood (1994),Brother Minister: The Assassination of Malcolm X (1994),Under Siege (1992),"Long Kiss Goodnight, The (1996)",Rumble in the Bronx (1995)
...,...,...,...,...,...
799,Ed Wood (1994),Brother Minister: The Assassination of Malcolm X (1994),Under Siege (1992),"Long Kiss Goodnight, The (1996)",0
358,Ed Wood (1994),Brother Minister: The Assassination of Malcolm X (1994),Under Siege (1992),"Long Kiss Goodnight, The (1996)",Rumble in the Bronx (1995)
410,Under Siege (1992),Ed Wood (1994),"Long Kiss Goodnight, The (1996)",Rumble in the Bronx (1995),Brother Minister: The Assassination of Malcolm X (1994)
598,Ed Wood (1994),Brother Minister: The Assassination of Malcolm X (1994),Under Siege (1992),"Long Kiss Goodnight, The (1996)",0


In [144]:
all_rec_df_title[0].unique()

array(['Ed Wood (1994)', 'Under Siege (1992)',
       'Brother Minister: The Assassination of Malcolm X (1994)',
       'Long Kiss Goodnight, The (1996)', 'Rumble in the Bronx (1995)'],
      dtype=object)