In [1]:
import os
import numpy as np
import pandas as pd
import torch

In [2]:
os.chdir("D:\\datasets\movielens")

In [3]:
moviesdf = pd.read_csv("movies.csv")
ratingsdf = pd.read_csv("ratings.csv")
del ratingsdf['timestamp']

In [4]:
ratingsdf.tail()

Unnamed: 0,userId,movieId,rating
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0
100835,610,170875,3.0


In [5]:
#to make a recommendation for a person, add ratings into the dataframe

def addrating(dataframe, movieIds, ratings):
    new_df = pd.DataFrame()
    new_df['movieId'] = pd.Series(movieIds).astype('int64')
    new_df['rating'] = pd.Series(ratings).astype('int64')
    new_df['userId'] = np.int64(900000)
    df = dataframe.append(new_df)
    
    for i in range(len(movieIds)):
        print('rating: ' + str(ratings[i]) + '  ||  ' +
              moviesdf[moviesdf['movieId'] == movieIds[i]]['title'].item())
    
    return df
#when adding our own user ratings we'll choose a larger userId than anything already in the dataframe
print(ratingsdf['userId'].max())

610


In [6]:
movieIds = [26171, 68237, 1251, 1232, 3584, 66097, 6711, 90866]
ratings = [5.0, 5.0, 4.0, 3.5, 3.5, 5.0, 4.0, 4.0]

addrating(ratingsdf, movieIds, ratings)


rating: 5.0  ||  Play Time (a.k.a. Playtime) (1967)
rating: 5.0  ||  Moon (2009)
rating: 4.0  ||  8 1/2 (8½) (1963)
rating: 3.5  ||  Stalker (1979)
rating: 3.5  ||  Breathless (1983)
rating: 5.0  ||  Coraline (2009)
rating: 4.0  ||  Lost in Translation (2003)
rating: 4.0  ||  Hugo (2011)


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
3,900000,1232,3.0
4,900000,3584,3.0
5,900000,66097,5.0
6,900000,6711,4.0


In [7]:
data = pd.merge(moviesdf, ratingsdf, on='movieId')

In [8]:
datasetpd = data.pivot_table(index = 'userId', columns = 'movieId', values = 'rating')
datasetpd.tail()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,
610,5.0,,,,,5.0,,,,,...,,,,,,,,,,


In [None]:
##in the event that the dataset is too large for .pivot_table, a manual implementation is provided below


#dataset = {}
#for i in moviesdf['movieId']:
#    movierated = ratingsdf[ratingsdf['movieId'] == i]
#    ser= pd.Series(list(movierated['rating']), list(movierated['userId']))
#    dataset[i] = ser
#datasetpd = pd.DataFrame(dataset)

We consider the loss function, a sum over terms where user $i$ has given a rating for movie $j$. This condition is represented by the function $r(i, j)$ which is equal to 1 whenever the condition holds, 0 otehrwise. There are regularization terms controlled by $\lambda$.
$$J = \frac{1}{2} \sum_{(i,j): r(i,j) = 1} ({\theta^{(j)}}^{T} x^{(i)} - y^{(i,j)})^2 + \frac{\lambda}{2}\sum_{i, k} x^{(i)}_k + \frac{\lambda}{2}\sum_{j, k} \theta^{(j)}_k$$

In the above formula, the row vector $\theta^{(j)}$ represents some latent features of movie $j$ to be learned, and $x^{(i)}$ represents some learned representation of user $i$

In [9]:
#Rmat will be important in determining which values to sum over.
#Entry (i,j) is True when user i gives a rating for movie j, and False otherwise.

Rmat = datasetpd.isnull()
Rmat.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,False,True,False,True,True,False,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
5,False,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


In [10]:
#it is sufficient to regard missing values (ie user i has not rated movie j) in the target matrix as having 0 value,
#as we will use Rmat to ensure that the corresponding terms are 0 when we evaluate the loss function.
datasetpdclean = datasetpd.fillna(value=0)

In [11]:
data_np = datasetpdclean.values
Rmat_np = Rmat.values
Rmat_np = Rmat_np.astype(float)

data_np.shape

(610, 9724)

In [42]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [120]:
users_torch = torch.randn((610, 20), requires_grad = True, dtype = torch.float, device = device)
movies_torch = torch.randn((20, 9724), requires_grad = True, dtype = torch.float, device = device)

Rmat_torch = 1 - torch.from_numpy(Rmat_np).to(device)
target_torch = torch.from_numpy(data_np).to(device)

### Our model will learn $\texttt{users_torch.shape[1]}$ dimensional vectors for each user and movie. Tentatively, we won't use regularization.

In [121]:
learning_rate = 1e-6
optimizer = torch.optim.SGD([users_torch, movies_torch], lr = learning_rate)

In [122]:
for epoch in range(30000):
    optimizer.zero_grad()
    
    #matrix multiplication
    yhat = torch.mm(users_torch, movies_torch)
    
    #element wise
    to_sum = torch.mul(yhat, Rmat_torch)
    
    error = (to_sum - target_torch)**2
    loss = torch.sum(error)
    
    loss.backward()
    
    optimizer.step()
    if epoch % 100 == 0:
        print(loss.item())

3383482.336797682
2916313.491249787
2616469.4937102697
2405834.5796255264
2248380.362078344
2125401.1160509684
2026180.1582448576
1944073.8852301892
1874703.5871202801
1815043.420098714
1762922.8267635438
1716736.5113093066
1675266.020759148
1637565.1908217715
1602884.1769433368
1570617.8010291934
1540269.7130764616
1511427.3041630443
1483744.0351232511
1456927.0076027
1430728.3660487572
1404939.4991721343
1379387.2482856726
1353931.5686839875
1328464.108789647
1302907.2427878573
1277213.2361574862
1251363.0399475668
1225364.590854973
1199250.2691853093
1173073.5382716116
1146904.7607761724
1120826.4242391386
1094927.9733995576
1069300.762994369
1044033.3432841667
1019207.5080587848
994895.3294383484
971157.2061086262
948040.970046399
925581.905788825
903803.5101589824
882718.6954130344
862331.3467153623
842637.8871473921
823628.8325284212
805290.2357517844
787604.8438319159
770553.1715083735
754114.2326588151
738266.2225699429
722986.9836119842
708254.3224970484
694046.2942510931
6803

In [100]:
learned_users = users_torch.to('cpu')
learned_movies = movies_torch.to('cpu')

user_input = learned_users[-1, :].view(1, 20)

In [101]:
predicted_ratings = torch.mm(user_input, learned_movies).view(-1,).tolist()

In [102]:
movie_indexes = sorted(range(len(predicted_ratings)), key=lambda k: predicted_ratings[k])

In [115]:
top_rated = movie_indexes[-10:]

In [119]:
#results from representing movies and users by 50 dimensional vectors
moviesdf['title'].take(top_rated)

8467                                     Gone Girl (2014)
4386    Man with the Movie Camera, The (Chelovek s kin...
3753                               Road to Morocco (1942)
6509                                   Rescue Dawn (2006)
1164                Lost World: Jurassic Park, The (1997)
6249                                    Little Man (2006)
3289                                    Adanggaman (2000)
8450                                Equalizer, The (2014)
5247                                        Taxi 3 (2003)
7451                    Undisputed III: Redemption (2010)
Name: title, dtype: object