# Movie recommender

In [1]:
# download data
!wget -nc http://files.grouplens.org/datasets/movielens/ml-20m.zip

--2021-01-22 15:41:17--  http://files.grouplens.org/datasets/movielens/ml-20m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 198702078 (189M) [application/zip]
Saving to: ‘ml-20m.zip’


2021-01-22 15:41:19 (98.5 MB/s) - ‘ml-20m.zip’ saved [198702078/198702078]



In [2]:
# unzip data
!unzip -n ml-20m.zip

Archive:  ml-20m.zip
   creating: ml-20m/
  inflating: ml-20m/genome-scores.csv  
  inflating: ml-20m/genome-tags.csv  
  inflating: ml-20m/links.csv        
  inflating: ml-20m/movies.csv       
  inflating: ml-20m/ratings.csv      
  inflating: ml-20m/README.txt       
  inflating: ml-20m/tags.csv         


In [4]:
# list files
!ls "ml-20m"

genome-scores.csv  links.csv   ratings.csv  tags.csv
genome-tags.csv    movies.csv  README.txt


In [6]:
# load dataframe
import pandas as pd

df = pd.read_csv("ml-20m/ratings.csv")
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


### Preprocessing

In [7]:
# userid space and movieid space have to in range 0 to n-1
# we can't assume that this is the case
df.userId = pd.Categorical(df.userId)
df['new_user_id'] = df.userId.cat.codes

In [12]:
df.movieId = pd.Categorical(df.movieId)
df['new_movie_id'] = df.movieId.cat.codes

In [21]:
# seperate data to NUMPY-ARRAYS (not pandas Series)
user_ids = df['new_user_id'].values
movie_ids = df['new_movie_id'].values
ratings = df['rating'].values

In [22]:
# Get Number of users and movies
U = len(user_ids)
M = len(movie_ids)

U,M

(20000263, 20000263)

In [23]:
# embedding dimension
K = 10

### Neural Network

In [34]:
import tensorflow as tf
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Activation

""" Inputs """
# User Input
u_in = Input(shape=(1,))

# Movies Input
m_in = Input(shape=(1,))

""" Embedding """
# User embedding
u_emb = Embedding(U, K)(u_in) # output (num_samples, 1, K)

# Movie embedding
m_emb = Embedding(M, K)(m_in) # output (num_samples, 1, K)

# Flatten Embeddings
u_emb = Flatten()(u_emb) # output (num_samples, K)
m_emb = Flatten()(m_emb) # output (num_samples, K)

""" Conatenate """
x = Concatenate()([u_emb, m_emb]) # output (num_samples, 2*K)

""" Feedforward Net """
x = Dense(1024)(x)
x = Activation("relu")(x)
out = Dense(1)(x) # No activation, because we predict the rating

In [43]:
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.models import Model

# compiling
model = Model(inputs=[u_in, m_in], outputs=out)
model.compile(
    loss="mse",
    optimizer=SGD(lr=0.08, momentum=0.9),
)

### Training

In [38]:
from sklearn.utils import shuffle

# splitting the data
user_ids, movie_ids, ratings = shuffle(user_ids, movie_ids, ratings) # shuffe
Ntrain = int(0.8*len(ratings)) # Ntrain is 80% of data

# actual splotting
train_user = user_ids[:Ntrain] # first 80% of users - [Start:End]
train_movie = movie_ids[:Ntrain]
train_rating = ratings[:Ntrain]

test_user = user_ids[Ntrain:]
test_movie = movie_ids[Ntrain:]
test_rating = ratings[Ntrain:]

In [40]:
# center the ratings
avg_rating = train_rating.mean()
train_rating = train_rating - avg_rating
test_rating = test_rating - avg_rating

In [None]:
hist = model.fit(
  x=[train_user, train_movie],
  y=train_rating,
  epochs=25,
  batch_size=1024,
  verbose=1,
  validation_data=([test_user, test_movie], test_rating),
)

Epoch 1/25