<a href="https://colab.research.google.com/github/Evans-tats/recomendation_systems/blob/main/collaborative_filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Collaborative filtering deep dive

In [1]:
from fastai.collab import *
from fastai.tabular.all import *
from fastdownload import download_url

# Fetch the MovieLens "latest-small" zip archive into the current directory.
# `path` ends up pointing at the `ml-latest-small` folder that sits next to the
# downloaded archive; the archive itself is kept under its own name.
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
zip_path = Path(download_url(url, dest='.'))
path = zip_path.parent/'ml-latest-small'



In [2]:

!unzip *.zip

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [3]:
# Load the rating records from the extracted dataset folder and preview the
# first rows (columns shown below: userId, movieId, rating, timestamp).
ratings = pd.read_csv(path/'ratings.csv')
ratings.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Load the movie metadata (movieId, title, genres) so ratings can be shown with
# human-readable titles.
movies = pd.read_csv(path/'movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Attach each rating's title and genres by joining on movieId.
# `validate='many_to_one'` makes pandas raise if `movies` ever contains a
# duplicated movieId, which would otherwise silently duplicate rating rows in
# the joined frame. `how='inner'` spells out the default join type.
combined = pd.merge(ratings, movies, on='movieId', how='inner', validate='many_to_one')
combined.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [6]:
# Bind the user, item and rating columns by name instead of relying on their
# position in `combined`, so reordering or adding columns upstream cannot
# silently change what the model is trained on. The selected columns are the
# same ones shown in the batch preview below (userId, title, rating).
dls = CollabDataLoaders.from_df(
    combined, user_name='userId', item_name='title', rating_name='rating', bs=64)
dls.show_batch()

Unnamed: 0,userId,title,rating
0,171,Star Wars: Episode V - The Empire Strikes Back (1980),4.0
1,220,Brokeback Mountain (2005),2.5
2,249,"Usual Suspects, The (1995)",4.0
3,23,Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964),3.0
4,560,Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981),4.0
5,66,Little Man Tate (1991),4.0
6,249,Dr. Dolittle (1998),3.0
7,325,Coma (1978),4.0
8,51,"Absent-Minded Professor, The (1961)",4.0
9,141,Ex Machina (2015),4.5


In [7]:
# Number of entries in the DataLoaders' class lists for the user and title
# columns; these are passed to the models below as embedding table sizes.
n_users = len(dls.classes['userId'])
n_movies = len(dls.classes['title'])
# NOTE(review): no later cell visible in this notebook reads `n_factors`; the
# models below are all built with a literal 50 - confirm which value is intended.
n_factors = 5

Collaborative filtering from scratch


In [8]:
# Quick refresher on Python classes.
class Example:
  """Toy class that stores a value and builds a greeting from it."""

  def __init__(self, a):
    # `a` is interpolated into every greeting produced by `say`.
    self.a = a

  def say(self, x):
    """Return 'Hello <a>, <x>.' using the stored value and the given message."""
    greeting = f'Hello {self.a}, {x}.'
    return greeting

In [9]:
# Instantiate the toy class and call its method; the stored value and the
# argument both appear in the returned string shown below.
ex = Example('Tats')
ex.say('youre awsome')

'Hello Tats, youre awsome.'

In [10]:
class DotProduct(Module):
  """Score a (user, movie) pair as the dot product of two learned vectors."""

  def __init__(self, n_user, n_movies, n_factors):
    super().__init__()
    # One learned row of length `n_factors` per user index and per movie index.
    self.user_factors = Embedding(n_user, n_factors)
    self.movie_factors = Embedding(n_movies, n_factors)

  def forward(self, x):
    """Return one score per row of `x`.

    Column 0 of `x` indexes `user_factors` and column 1 indexes
    `movie_factors`; the looked-up vectors are multiplied element-wise and
    summed over dimension 1.
    """
    user_idx, movie_idx = x[:, 0], x[:, 1]
    user_vecs = self.user_factors(user_idx)
    movie_vecs = self.movie_factors(movie_idx)
    return (user_vecs * movie_vecs).sum(dim=1)

In [11]:
# Pull a single batch to check tensor shapes before training: per the output
# below, x holds two index columns per row and y holds one target per row.
x,y = dls.one_batch()
x.shape, y.shape

(torch.Size([64, 2]), torch.Size([64, 1]))

In [12]:
# Build the plain dot-product model with 50 factors per embedding and wrap it in
# a Learner that uses a flattened mean-squared-error loss.
model = DotProduct(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())

In [13]:
# Train for 5 epochs with the one-cycle schedule (second argument: 5e-3).
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,2.426118,2.646618,00:08
1,1.172078,1.681322,00:06
2,0.903355,1.460359,00:07
3,0.700571,1.319316,00:06
4,0.615095,1.29396,00:07


In [14]:
class DotProduct2(Module):
  """Dot-product model whose score is passed through `sigmoid_range`."""

  def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
    super().__init__()
    self.user_factors = Embedding(n_users, n_factors)
    self.movie_factors = Embedding(n_movies, n_factors)
    # Bounds forwarded to `sigmoid_range` in `forward`.
    self.y_range = y_range

  def forward(self, x):
    """Return one bounded score per row of `x`.

    Column 0 of `x` indexes `user_factors`, column 1 indexes `movie_factors`;
    their element-wise product is summed over dimension 1 and then handed to
    `sigmoid_range` together with the unpacked `y_range`.
    """
    user_idx, movie_idx = x[:, 0], x[:, 1]
    dot = (self.user_factors(user_idx) * self.movie_factors(movie_idx)).sum(dim=1)
    return sigmoid_range(dot, *self.y_range)


In [15]:
# Same training recipe as the plain dot-product run, now with the model whose
# output goes through sigmoid_range: 50 factors, MSE loss, 5 one-cycle epochs.
model = DotProduct2(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,0.815541,0.921404,00:07
1,0.515106,0.868738,00:06
2,0.305198,0.860376,00:11
3,0.22697,0.851852,00:13
4,0.205875,0.850127,00:10


In [16]:
class DotProductBias(Module):
  """Dot-product model with an additional learned bias per user and per movie."""

  def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
    # Initialise the base class explicitly, as the other model classes in this
    # notebook do, so that registering the embeddings below does not depend on
    # the base class performing that initialisation implicitly.
    super().__init__()
    self.user_factors = Embedding(n_users, n_factors)
    self.user_bias = Embedding(n_users, 1)
    self.movie_factors = Embedding(n_movies, n_factors)
    self.movie_bias = Embedding(n_movies, 1)
    # Bounds forwarded to `sigmoid_range` in `forward`.
    self.y_range = y_range

  def forward(self,x):
    """Return one bounded score per row of `x`, keeping a trailing dimension of 1.

    Column 0 of `x` indexes the user tables and column 1 the movie tables.
    """
    users = self.user_factors(x[:,0])
    movies = self.movie_factors(x[:,1])
    # keepdim=True keeps the summed dimension so the width-1 bias embeddings
    # can be added directly.
    res = (users * movies).sum(dim=1, keepdim=True)
    res += self.user_bias(x[:,0]) + self.movie_bias(x[:,1])
    return sigmoid_range(res, *self.y_range)


In [17]:
# Train the bias-augmented model with the same settings as the earlier runs
# (50 factors, MSE loss, 5 one-cycle epochs at 5e-3).
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,0.703364,0.818282,00:11
1,0.424139,0.780509,00:09
2,0.267737,0.791182,00:08
3,0.185765,0.793953,00:12
4,0.173938,0.794395,00:09


In [18]:
# Re-create the model from scratch and repeat the run, this time passing
# wd=0.1 to the fit call; everything else matches the previous cell.
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.731931,0.823079,00:08
1,0.48171,0.776979,00:07
2,0.348316,0.755816,00:08
3,0.291793,0.743821,00:11
4,0.271136,0.741747,00:09


# Creating our own embedding

In [19]:
# A bare tensor stored as an attribute is not picked up by parameters():
# the recorded output below is an empty list.
class T(Module):
  def __init__(self): self.a = torch.ones(3)

L(T().parameters())


(#0) []

In [35]:
# Wrapping the same tensor in nn.Parameter makes it appear in parameters():
# the recorded output below lists one entry with requires_grad=True.
class T(Module):
  def __init__(self): self.a = nn.Parameter(torch.ones(3))

L(T().parameters())

(#1) [Parameter containing:
tensor([1., 1., 1.], requires_grad=True)]

In [21]:
# Storing a layer (here nn.Linear with its bias disabled) also exposes that
# layer's weight through parameters(), as the single entry below shows.
class T(Module):
  def __init__(self): self.a  = nn.Linear(1,3, bias=False)

t = T()
L(t.parameters())


(#1) [Parameter containing:
tensor([[ 0.8562],
        [-0.7607],
        [-0.9923]], requires_grad=True)]

In [22]:
# Check the type of the layer's weight; the output below reports nn.Parameter.
type(t.a.weight)

torch.nn.parameter.Parameter

In [36]:
def create_params(size):
  """Return an `nn.Parameter` of shape `size` filled with N(0, 0.01) samples.

  `size` is an iterable of dimension lengths; it is unpacked into
  `torch.zeros`, and the resulting tensor is overwritten in place with draws
  from a normal distribution (mean 0, standard deviation 0.01).
  """
  weights = torch.zeros(*size)
  weights.normal_(0, 0.01)
  return nn.Parameter(weights)


In [39]:
class DotProductBias(Module):
  """Bias-augmented dot-product model built on raw parameter tensors.

  The factor and bias tables come from `create_params` and are indexed
  directly in `forward`.
  """

  def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
    super().__init__()
    # 2-D factor tables: one row per user / movie. 1-D bias tables: one value each.
    self.user_factors = create_params([n_users, n_factors])
    self.user_bias = create_params([n_users])
    self.movie_factors = create_params([n_movies, n_factors])
    self.movie_bias = create_params([n_movies])
    # Bounds forwarded to `sigmoid_range` in `forward`.
    self.y_range = y_range

  def forward(self, x):
    """Return one bounded score per row of `x`, with a trailing dimension of 1.

    Column 0 of `x` indexes the user tables and column 1 the movie tables.
    """
    user_idx, movie_idx = x[:, 0], x[:, 1]
    dot = (self.user_factors[user_idx] * self.movie_factors[movie_idx]).sum(dim=1, keepdim=True)
    # The bias tables are 1-D, so add a trailing dimension to line up with `dot`.
    bias = (self.user_bias[user_idx] + self.movie_bias[movie_idx]).unsqueeze(1)
    return sigmoid_range(dot + bias, *self.y_range)



In [40]:
# Train the hand-rolled-parameter version with the same settings as the
# weight-decay run (50 factors, MSE loss, 5 one-cycle epochs, wd=0.1).
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.737259,0.827873,00:08
1,0.478944,0.772301,00:08
2,0.357931,0.755754,00:08
3,0.279895,0.745007,00:07
4,0.262306,0.742571,00:08


In [50]:
# Take the learned per-movie bias tensor from the trained model and check its
# shape; the output below shows a single dimension of length 9720.
movie_bias = learn.model.movie_bias.squeeze()
movie_bias.shape

torch.Size([9720])

In [58]:
# NOTE(review): the recorded output reports "Total params: 0" although the model
# holds nn.Parameter tensors; presumably summary() only counts parameters owned
# by child layers, and this model stores its parameters directly - confirm.
learn.summary()

DotProductBias (Input shape: 64 x 2)
Layer (type)         Output Shape         Param #    Trainable 
                     64 x 1              
DotProductBias                                                 
____________________________________________________________________________

Total params: 0
Total trainable params: 0
Total non-trainable params: 0

Optimizer used: <function Adam at 0x7af21009c180>
Loss function: FlattenedLoss of MSELoss()

Model unfrozen

Callbacks:
  - TrainEvalCallback
  - CastToTensor
  - Recorder
  - ProgressCallback

In [51]:
# Titles of the five movies with the lowest learned bias: argsort orders the
# indices by ascending bias and the first five are mapped back to titles.
movie_bias = learn.model.movie_bias
idxs = movie_bias.argsort()[:5]
[dls.classes['title'][i] for i in idxs]

['I Know What You Did Last Summer (1997)',
 'Catwoman (2004)',
 'Teenage Mutant Ninja Turtles III (1993)',
 'Batman & Robin (1997)',
 'Battlefield Earth (2000)']

In [52]:
# Titles of the five movies with the highest learned bias: the last five
# indices of the ascending argsort, so the largest bias is listed last.
idxs = movie_bias.argsort()[-5:]
[dls.classes['title'][i] for i in idxs]

['Star Wars: Episode IV - A New Hope (1977)',
 'Full Metal Jacket (1987)',
 'Forrest Gump (1994)',
 'Dark Knight, The (2008)',
 'Shawshank Redemption, The (1994)']