<a href="https://colab.research.google.com/github/Evans-tats/recomendation_systems/blob/main/collaborative_filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Collaborative filtering deep dive

In [1]:
from fastai.collab import *
from fastai.tabular.all import *
from fastdownload import download_url

# The MovieLens "latest small" dataset is published as a .zip archive.
# Fetch it into the working directory; the next cell unpacks it.
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
# `path` points at the folder the archive expands into, next to the downloaded file.
path = Path(download_url(url, dest='.')).parent/'ml-latest-small'



In [2]:

!unzip *.zip

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [3]:
# Read the per-user movie ratings from the extracted dataset folder
# and preview the first rows.
ratings = pd.read_csv(path.joinpath('ratings.csv'))
ratings.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Movie metadata (movieId -> title, genres) from the same dataset folder.
movies = pd.read_csv(path.joinpath('movies.csv'))
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Attach title and genres to every rating by joining the two frames on their
# shared `movieId` column (inner join, which is also pandas' default).
combined = pd.merge(left=ratings, right=movies, on='movieId', how='inner')
combined.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [6]:
# Build the collaborative-filtering DataLoaders from the merged frame.
# - The user and rating columns are named explicitly instead of being picked
#   by column position, so reordering `combined` cannot silently change them.
# - `seed` fixes the random train/validation split so the validation set is
#   the same on every run.
dls = CollabDataLoaders.from_df(
    combined,
    user_name='userId',
    item_name='title',
    rating_name='rating',
    seed=42,
    bs=64,
)
dls.show_batch()

Unnamed: 0,userId,title,rating
0,599,"Cowboy Way, The (1994)",2.0
1,284,Nine Months (1995),4.0
2,153,Finding Nemo (2003),1.0
3,195,"Breakfast Club, The (1985)",3.0
4,19,Ghost (1990),3.0
5,313,Being John Malkovich (1999),4.0
6,356,Minority Report (2002),3.5
7,441,Bill & Ted's Excellent Adventure (1989),5.0
8,580,"Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005)",3.5
9,298,"Departed, The (2006)",3.0


In [7]:
# Number of distinct categories for users and titles, taken from the DataLoaders;
# these are passed to the models below as their embedding table sizes.
n_users, n_movies = (len(dls.classes[col]) for col in ('userId', 'title'))
# NOTE(review): the model cells visible in this notebook pass 50 factors directly,
# so this value is not used by them.
n_factors = 5

Collaborative filtering from scratch


In [8]:
# Quick refresher on object-oriented programming.
class Example:
  """Tiny class that stores one value and greets with it."""

  def __init__(self, a):
    """Store `a` on the instance for later use by `say`."""
    self.a = a

  def say(self, x):
    """Return a greeting built from the stored `a` followed by `x`."""
    greeting = f'Hello {self.a}, {x}.'
    return greeting

In [9]:
# Instantiate the class, then call its method; the cell displays the returned string.
ex = Example(a='Tats')
ex.say(x='youre awsome')

'Hello Tats, youre awsome.'

In [10]:
class DotProduct(Module):
  """Plain dot-product collaborative-filtering model.

  Each user and each movie gets an `n_factors`-wide embedding; the prediction
  for a (user, movie) pair is the dot product of the two embeddings.
  """

  def __init__(self, n_user, n_movies, n_factors):
    """Create the embedding tables.

    n_user:    number of rows in the user embedding table.
    n_movies:  number of rows in the movie embedding table.
    n_factors: width of every embedding vector.
    """
    super().__init__()
    self.user_factors = Embedding(n_user, n_factors)
    self.movie_factors = Embedding(n_movies, n_factors)

  def forward(self, x):
    """Score a batch: column 0 of `x` holds user indices, column 1 movie indices.

    Returns one value per row of `x` (the factor dimension is summed away).
    """
    user_idx, movie_idx = x[:,0], x[:,1]
    user_vecs = self.user_factors(user_idx)
    movie_vecs = self.movie_factors(movie_idx)
    # Element-wise product summed over the factor axis == row-wise dot product.
    return (user_vecs * movie_vecs).sum(dim=1)

In [11]:
# Grab a single mini-batch and display the shapes of its inputs and targets,
# i.e. what the model's forward() receives and what the loss compares against.
x, y = dls.one_batch()
tuple(t.shape for t in (x, y))

(torch.Size([64, 2]), torch.Size([64, 1]))

In [12]:
# Baseline: unbounded dot-product model with 50 factors, trained with
# mean-squared error on the ratings.
model = DotProduct(n_users, n_movies, n_factors=50)
learn = Learner(dls=dls, model=model, loss_func=MSELossFlat())

In [13]:
# Train for 5 epochs with the one-cycle schedule, peaking at a learning rate of 5e-3.
learn.fit_one_cycle(n_epoch=5, lr_max=5e-3)

epoch,train_loss,valid_loss,time
0,2.467746,2.581581,00:07
1,1.211116,1.704333,00:06
2,0.839409,1.461497,00:06
3,0.682365,1.347332,00:06
4,0.654557,1.316518,00:06


In [14]:
class DotProduct2(Module):
  """Dot-product model whose output is passed through `sigmoid_range`.

  Same embeddings as the plain dot-product model, but the raw score is
  squashed using the two bounds stored in `y_range`.
  """

  def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
    """Create the embedding tables and remember the output bounds.

    n_users:   number of rows in the user embedding table.
    n_movies:  number of rows in the movie embedding table.
    n_factors: width of every embedding vector.
    y_range:   (low, high) bounds handed to `sigmoid_range`; the default upper
               bound is presumably a little above the 5.0 maximum rating so the
               sigmoid can actually reach it — confirm.
    """
    super().__init__()
    self.y_range = y_range
    self.user_factors = Embedding(n_users, n_factors)
    self.movie_factors = Embedding(n_movies, n_factors)

  def forward(self, x):
    """Score a batch: column 0 of `x` holds user indices, column 1 movie indices."""
    user_vecs = self.user_factors(x[:,0])
    movie_vecs = self.movie_factors(x[:,1])
    # Row-wise dot product of the user and movie vectors.
    scores = (user_vecs * movie_vecs).sum(dim=1)
    low, high = self.y_range
    return sigmoid_range(scores, low, high)


In [15]:
# Same training recipe as the baseline (50 factors, MSE loss, 5 one-cycle epochs
# peaking at 5e-3), now with predictions bounded by the model's y_range.
model = DotProduct2(n_users, n_movies, n_factors=50)
learn = Learner(dls=dls, model=model, loss_func=MSELossFlat())
learn.fit_one_cycle(n_epoch=5, lr_max=5e-3)

epoch,train_loss,valid_loss,time
0,0.776552,0.916033,00:06
1,0.483882,0.864772,00:07
2,0.301781,0.864207,00:06
3,0.232798,0.861819,00:06
4,0.200138,0.860073,00:08


In [16]:
class DotProductBias(Module):
  """Dot-product model with a learned bias per user and per movie.

  The prediction is `user · movie + user_bias + movie_bias`, passed through
  `sigmoid_range` with the bounds stored in `y_range`.
  """

  def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
    """Create the factor and bias embedding tables.

    n_users:   number of rows in the user tables.
    n_movies:  number of rows in the movie tables.
    n_factors: width of the factor embeddings (biases are always width 1).
    y_range:   (low, high) bounds handed to `sigmoid_range`.
    """
    # Explicit base-class initialisation, matching DotProduct and DotProduct2.
    # It is mandatory if the base is ever plain torch.nn.Module, which refuses
    # sub-module assignment before its __init__ has run.
    super().__init__()
    self.user_factors = Embedding(n_users, n_factors)
    self.user_bias = Embedding(n_users, 1)
    self.movie_factors = Embedding(n_movies, n_factors)
    self.movie_bias = Embedding(n_movies, 1)
    self.y_range = y_range

  def forward(self, x):
    """Score a batch: column 0 of `x` holds user indices, column 1 movie indices.

    Returns a tensor with one column (shape `(rows, 1)`), because the factor sum
    keeps its reduced dimension so it lines up with the width-1 bias embeddings.
    """
    user_idx, movie_idx = x[:,0], x[:,1]
    users = self.user_factors(user_idx)
    movies = self.movie_factors(movie_idx)
    # keepdim=True keeps shape (rows, 1) so the biases can be added directly.
    res = (users * movies).sum(dim=1, keepdim=True)
    res += self.user_bias(user_idx) + self.movie_bias(movie_idx)
    return sigmoid_range(res, *self.y_range)


In [17]:
# Same training recipe again (50 factors, MSE loss, 5 one-cycle epochs peaking
# at 5e-3), this time with the per-user and per-movie bias terms included.
model = DotProductBias(n_users, n_movies, n_factors=50)
learn = Learner(dls=dls, model=model, loss_func=MSELossFlat())
learn.fit_one_cycle(n_epoch=5, lr_max=5e-3)

epoch,train_loss,valid_loss,time
0,0.724622,0.828244,00:07
1,0.467966,0.797958,00:07
2,0.264383,0.79987,00:07
3,0.190002,0.801511,00:07
4,0.170828,0.800741,00:07
