In [None]:
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

[K     |████████████████████████████████| 720 kB 8.7 MB/s 
[K     |████████████████████████████████| 1.2 MB 35.8 MB/s 
[K     |████████████████████████████████| 186 kB 61.8 MB/s 
[K     |████████████████████████████████| 46 kB 4.6 MB/s 
[K     |████████████████████████████████| 56 kB 4.5 MB/s 
[K     |████████████████████████████████| 51 kB 346 kB/s 
[?25hMounted at /content/gdrive


In [None]:
# Star-imports from fastai. No other imports are visible in this notebook, so
# these are presumably where the unqualified names used below come from
# (pd, np, torch, nn, Module, Embedding, Learner, ...).
from fastai.collab import *
from fastai.tabular.all import *
# Resolve the ML_100k dataset location; `path` is used below as the directory
# root for path/"u.data" and path/"u.item".
path = untar_data(URLs.ML_100k)

In [None]:
# u.data is read as tab-separated with no header row, so the four column
# names are supplied explicitly.
ratings = pd.read_csv(path/"u.data", delimiter='\t', header=None,
                      names=['user', 'movie', 'rating', 'timestamp'])
# Preview the first rows.
ratings.head()

Unnamed: 0,user,movie,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [None]:
# Hand-written 3-component vectors for one movie and one user, used as a toy
# example for the dot products in the following cells.
last_skywalker = np.array([0.98, 0.9, -0.9])
user1 = np.array([0.9, 0.8, -0.6])

In [None]:
# Dot product of the two vectors: element-wise multiply, then sum.
(user1*last_skywalker).sum()

2.1420000000000003

In [None]:
# A second hand-written movie vector, with signs roughly opposite to user1's.
casablanca = np.array([-0.99, -0.3, 0.8])

In [None]:
# Same dot product against the second movie vector; the result is negative.
(user1*casablanca).sum()

-1.611

In [None]:
# u.item is read as pipe-delimited, latin-1 encoded, with no header row.
# Only the first two columns are kept and named 'movie' (id) and 'title'.
movies = pd.read_csv(path/"u.item", delimiter='|',encoding='latin-1', header=None,
                      names=('movie', 'title'), usecols=(0,1))

In [None]:
# Preview the id -> title table.
movies.head()

Unnamed: 0,movie,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [None]:
# (rows, columns) of the movie table.
movies.shape

(1682, 2)

In [None]:
# merge() without `on=` joins on the columns both frames share (here 'movie'),
# which attaches a 'title' to every rating row.
# NOTE: this rebinds `ratings`; the frame previewed earlier had no 'title' column.
ratings = ratings.merge(movies)
ratings.head()

Unnamed: 0,user,movie,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [None]:
# Build collaborative-filtering DataLoaders from the merged frame, using
# 'title' (rather than the numeric movie id) as the item column; batch size 64.
dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=64)
# Display a sample of (user, title, rating) rows.
dls.show_batch()

Unnamed: 0,user,title,rating
0,13,Blade Runner (1982),4
1,37,Die Hard: With a Vengeance (1995),4
2,135,"Fugitive, The (1993)",3
3,774,Cinderella (1950),2
4,498,Fargo (1996),3
5,807,Die Hard (1988),4
6,577,My Family (1995),4
7,13,Blood Beach (1981),1
8,249,Dead Man Walking (1995),5
9,821,To Kill a Mockingbird (1962),5


In [None]:
# Vocabulary sizes for the two categorical columns, as recorded by the DataLoaders.
n_users = len(dls.classes['user'])
n_movies = len(dls.classes['title'])
n_factors = 5

# Randomly initialised (standard normal) factor matrices: one row per user /
# per movie, n_factors columns. No seed is set, so values differ between runs.
user_factors = torch.randn(n_users, n_factors)
movie_factors = torch.randn(n_movies, n_factors)

In [None]:
# One-hot vector of length n_users for index 3, cast to float for the matmul.
one_hot_3 = one_hot(3, n_users).float()
# Multiplying the transposed factor matrix by the one-hot vector picks out
# row 3 of user_factors (compare with the direct index in the next cell).
user_factors.t() @ one_hot_3

tensor([-1.8574,  0.1701,  0.4126,  0.8374, -0.5413])

In [36]:
# Direct row indexing; the recorded output matches the one-hot matrix product above.
user_factors[3]

tensor([-1.8574,  0.1701,  0.4126,  0.8374, -0.5413])

In [32]:
class DotProduct(Module):
    """Collaborative-filtering model scoring a (user, movie) pair by a dot product.

    Each user and each movie gets an `n_factors`-wide embedding. The prediction
    is the sum of the element-wise product of the two embeddings, passed through
    `sigmoid_range` with the bounds in `y_range`.
    """

    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        # Creation order is left as in the original so parameter registration
        # (and any random initialisation order) is unchanged.
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.y_range = y_range

    def forward(self, x):
        """Score a batch; column 0 of `x` holds user indices, column 1 movie indices."""
        user_vecs = self.user_factors(x[:,0])
        movie_vecs = self.movie_factors(x[:,1])
        # Row-wise dot product.
        scores = (user_vecs * movie_vecs).sum(dim=1)
        return sigmoid_range(scores, *self.y_range)

# Train the plain dot-product model with 50 factors: 5 epochs of
# fit_one_cycle at learning rate 5e-3, mean-squared-error loss.
model = DotProduct(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,0.972037,0.974288,00:08
1,0.858376,0.890013,00:08
2,0.696124,0.855911,00:09
3,0.495908,0.860677,00:08
4,0.364098,0.865246,00:08


In [34]:
class DotProductBias(Module):
    """Dot-product collaborative-filtering model with per-user and per-movie biases.

    The prediction is the row-wise dot product of the user and movie embeddings
    plus a single bias value for the user and one for the movie, passed through
    `sigmoid_range` with the bounds in `y_range`.
    """

    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        # Creation order is left as in the original so parameter registration
        # (and any random initialisation order) is unchanged.
        self.user_factors = Embedding(n_users, n_factors)
        self.user_bias = Embedding(n_users, 1)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.movie_bias = Embedding(n_movies, 1)
        self.y_range = y_range

    def forward(self, x):
        """Score a batch; column 0 of `x` holds user indices, column 1 movie indices."""
        user_idx, movie_idx = x[:,0], x[:,1]
        user_vecs = self.user_factors(user_idx)
        movie_vecs = self.movie_factors(movie_idx)
        # keepdim=True keeps a trailing size-1 dimension so the result lines up
        # with the width-1 bias embeddings added below.
        interaction = (user_vecs * movie_vecs).sum(dim=1, keepdim=True)
        bias = self.user_bias(user_idx) + self.movie_bias(movie_idx)
        return sigmoid_range(interaction + bias, *self.y_range)

In [35]:
# Train the bias-augmented model with the same settings as the plain model
# (50 factors, 5 epochs, learning rate 5e-3, MSE loss).
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,0.928997,0.932917,00:09
1,0.848287,0.856139,00:09
2,0.629009,0.852597,00:09
3,0.406725,0.875859,00:09
4,0.298798,0.883143,00:09


In [37]:
# Re-create the model from scratch and train again, this time with weight
# decay (wd=0.1); everything else is unchanged from the previous run.
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.939521,0.930439,00:09
1,0.830076,0.848529,00:09
2,0.725054,0.812952,00:09
3,0.601716,0.802829,00:09
4,0.479984,0.801628,00:09


In [38]:
def create_params(size, std=0.01):
    """Create a trainable tensor of shape `size` initialised from a normal distribution.

    Args:
        size: Sequence of ints giving the tensor dimensions,
            e.g. ``[n_users, n_factors]`` or ``[n_users]``.
        std: Standard deviation of the zero-mean normal initialisation.
            Defaults to 0.01, the value that was previously hard-coded.

    Returns:
        An ``nn.Parameter`` wrapping the initialised tensor.
    """
    weights = torch.zeros(*size)
    # In-place fill with samples from N(0, std**2).
    weights.normal_(0, std)
    return nn.Parameter(weights)

In [39]:
class DotProductBias(Module):
    """Dot-product model with biases, built from raw parameters instead of Embedding layers.

    Factor matrices and bias vectors are created with `create_params` and are
    looked up by plain tensor indexing. This definition replaces the earlier
    class of the same name in the notebook.
    """

    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        # Creation order is left as in the original so parameter registration
        # and the order in which random initial values are drawn are unchanged.
        self.user_factors = create_params([n_users, n_factors])
        self.user_bias = create_params([n_users])
        self.movie_factors = create_params([n_movies, n_factors])
        self.movie_bias = create_params([n_movies])
        self.y_range = y_range

    def forward(self, x):
        """Score a batch; column 0 of `x` holds user indices, column 1 movie indices."""
        user_idx, movie_idx = x[:,0], x[:,1]
        user_vecs = self.user_factors[user_idx]
        movie_vecs = self.movie_factors[movie_idx]
        # Biases here are 1-D, so the dot product is summed without keepdim.
        interaction = (user_vecs * movie_vecs).sum(dim=1)
        bias = self.user_bias[user_idx] + self.movie_bias[movie_idx]
        return sigmoid_range(interaction + bias, *self.y_range)

In [40]:
# Train the raw-parameter version of DotProductBias (the most recent
# definition of that name) with the same settings as the weight-decay run.
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.925336,0.917677,00:09
1,0.832964,0.862518,00:09
2,0.708594,0.816562,00:09
3,0.592604,0.807653,00:09
4,0.492036,0.808579,00:09


In [42]:
# Per-movie bias values from the trained model; squeeze() drops any size-1 dimensions.
movie_bias = learn.model.movie_bias.squeeze()
# argsort() is ascending, so the first five indices are the five lowest biases.
idxs = movie_bias.argsort()[:5]
# Map the indices back to titles through the DataLoaders vocabulary.
[dls.classes['title'][i] for i in idxs]

['Lawnmower Man 2: Beyond Cyberspace (1996)',
 'Children of the Corn: The Gathering (1996)',
 'Amityville II: The Possession (1982)',
 'Crow: City of Angels, The (1996)',
 'Jury Duty (1995)']

In [43]:
# Same experiment using fastai's built-in collab_learner instead of the
# hand-written model: 50 factors, outputs bounded by y_range=(0, 5.5).
# NOTE: this rebinds `learn`, replacing the Learner from the previous section.
learn = collab_learner(dls, n_factors=50, y_range=(0, 5.5))
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.937828,0.92845,00:09
1,0.889912,0.863922,00:10
2,0.743556,0.820467,00:10
3,0.588193,0.806527,00:09
4,0.494305,0.807681,00:09


In [44]:
# Presumably the item (title) embedding weights of the collab_learner model;
# confirm against fastai's model definition.
# NOTE: this rebinds `movie_factors`, which earlier held a random tensor.
movie_factors = learn.model.i_weight.weight
# Vocabulary index of the query title.
idx = dls.classes['title'].o2i['Silence of the Lambs, The (1991)']
# Despite the name, `distances` holds cosine *similarities* between every row
# and the query row ([None] adds a leading dimension for broadcasting).
distances = nn.CosineSimilarity(dim=1)(movie_factors, movie_factors[idx][None])
# Sort most-similar first; position 0 is presumably the query movie itself,
# so position 1 is taken as the nearest other title. `idx` is reused for the result.
idx = distances.argsort(descending=True)[1]
dls.classes['title'][idx]

'Sweet Nothing (1995)'

In [45]:
# Embedding sizes derived from the DataLoaders: one (n_classes, emb_width)
# pair per categorical column, here user then title.
embs = get_emb_sz(dls)
embs

[(944, 74), (1665, 102)]

In [53]:
class CollabNN(Module):
    """Collaborative-filtering model that feeds concatenated embeddings to a small MLP.

    `user_sz` and `item_sz` are (n_classes, emb_width) pairs. The two embeddings
    are concatenated and passed through Linear -> ReLU -> Linear(…, 1); the
    result goes through `sigmoid_range` with the bounds in `y_range`.
    """

    def __init__(self, user_sz, item_sz, y_range=(0,5.5), n_act=100):
        # Creation order is left as in the original so parameter registration
        # (and any random initialisation order) is unchanged.
        self.user_factors = Embedding(*user_sz)
        self.item_factors = Embedding(*item_sz)
        # Input width of the first linear layer is the sum of both embedding widths.
        n_inputs = user_sz[1]+item_sz[1]
        self.layers = nn.Sequential(
            nn.Linear(n_inputs, n_act),
            nn.ReLU(),
            nn.Linear(n_act, 1))
        self.y_range = y_range

    def forward(self, x):
        """Score a batch; column 0 of `x` holds user indices, column 1 item indices."""
        user_vecs = self.user_factors(x[:,0])
        item_vecs = self.item_factors(x[:,1])
        combined = torch.cat((user_vecs, item_vecs), dim=1)
        return sigmoid_range(self.layers(combined), *self.y_range)

In [54]:
# Unpack the two size pairs in `embs` as user_sz and item_sz.
model = CollabNN(*embs)

In [55]:
# Train the neural-net model: 5 epochs, learning rate 5e-3, MSE loss.
# Weight decay here is 0.01, lower than the 0.1 used for the dot-product models.
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.01)

epoch,train_loss,valid_loss,time
0,0.98078,0.925665,00:11
1,0.902454,0.908065,00:11
2,0.849086,0.864966,00:11
3,0.825956,0.848734,00:11
4,0.786497,0.852516,00:11


In [56]:
# fastai's built-in neural-net variant: use_nn=True with hidden layer sizes
# [100, 50], outputs bounded by y_range=(0, 5.5).
learn = collab_learner(dls, use_nn=True, y_range=(0, 5.5), layers=[100,50])
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,1.014726,0.990097,00:13
1,0.926388,0.894447,00:13
2,0.901711,0.869466,00:13
3,0.840886,0.845174,00:13
4,0.765564,0.848548,00:13
