# Content-Based Filtering

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dot
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.losses import MeanSquaredError

## Function
$
F(v_{u}^{j},  v_{m}^{i}) = v_{u}^{j}.v_{m}^{i}
$
<br>
where $v_{u}^{j}$ is vector of users calculated from $X_{u}^{j}$ and also $v_{m}^{i}$ is a vector calculated from $X_{m}^{i}$

## Cost function
Cost function is actually Mean squared error of the function<br>
$
J(v_{u}, v_{m}) = \displaystyle\sum_{(i, j):r(i, j)=1} (v_{u}^{j}.v_{m}^{i} - y^{(i, j)})^2
$

## Getting Data ready

In [2]:
user_df = pd.read_csv('./ratings.csv')
user_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
movie_ratings = user_df.groupby('movieId').mean()['rating']
movie_times_rated = user_df.movieId.value_counts()

In [4]:
movies_df = pd.read_csv('./movies.csv')
movies_df.shape

(9742, 3)

In [5]:
movies_df.genres = movies_df.genres.str.split('|')
movies_df = movies_df.explode('genres')
movies_df = pd.get_dummies(data=movies_df, columns=['genres'], drop_first=True)
movies_df = movies_df.groupby(['title', 'movieId'], as_index=False).sum()
movies_df.set_index('movieId', inplace=True)
movies_df = movies_df.sort_index()
movies_df['average_rating'] = movie_ratings
movies_df['times_rated'] = movie_times_rated
movie_year_df = movies_df.title.str.extract(r'(?P<year>[\d]{4})')
movies_df = pd.concat([movies_df, movie_year_df], axis=1)
movies_df = movies_df.sort_index()
movies_df.year = movies_df.year.fillna('0').astype(np.int32)
movies_df.head(10)

Unnamed: 0_level_0,title,genres_Action,genres_Adventure,genres_Animation,genres_Children,genres_Comedy,genres_Crime,genres_Documentary,genres_Drama,genres_Fantasy,...,genres_Musical,genres_Mystery,genres_Romance,genres_Sci-Fi,genres_Thriller,genres_War,genres_Western,average_rating,times_rated,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,3.92093,215.0,1995
2,Jumanji (1995),0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,3.431818,110.0,1995
3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,3.259615,52.0,1995
4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,2.357143,7.0,1995
5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,3.071429,49.0,1995
6,Heat (1995),1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,3.946078,102.0,1995
7,Sabrina (1995),0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,3.185185,54.0,1995
8,Tom and Huck (1995),0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,2.875,8.0,1995
9,Sudden Death (1995),1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3.125,16.0,1995
10,GoldenEye (1995),1,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,3.496212,132.0,1995


In [6]:
user_df.set_index('movieId', inplace=True)
user_df = user_df.join(movies_df.iloc[:, 1:-4])
user_df = user_df.drop('timestamp', axis=1)
def func(x):
    x.iloc[2:] = x.iloc[2:] * x['rating']
    return x
user_df_update = user_df.apply(lambda x: func(x), axis=1)
user_df_update = user_df_update.reset_index().groupby(['userId', 'movieId'])[user_df.columns[1:]].sum()

In [7]:
user_df = user_df_update.reset_index()

In [8]:
movies_df = movies_df.loc[user_df.movieId].reset_index()
titles = movies_df.set_index('movieId')['title']
movies_df = movies_df.drop('title', axis=1)

In [9]:
user_df

Unnamed: 0,userId,movieId,rating,genres_Action,genres_Adventure,genres_Animation,genres_Children,genres_Comedy,genres_Crime,genres_Documentary,...,genres_Fantasy,genres_Film-Noir,genres_Horror,genres_IMAX,genres_Musical,genres_Mystery,genres_Romance,genres_Sci-Fi,genres_Thriller,genres_War
0,1.0,1,4.0,0.0,4.0,4.0,4.0,4.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,3,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
2,1.0,6,4.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
3,1.0,47,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,5.0,0.0
4,1.0,50,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,5.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610.0,166534,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
100832,610.0,168248,5.0,5.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
100833,610.0,168250,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100834,610.0,168252,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0


In [10]:
movies_df

Unnamed: 0,movieId,genres_Action,genres_Adventure,genres_Animation,genres_Children,genres_Comedy,genres_Crime,genres_Documentary,genres_Drama,genres_Fantasy,...,genres_Musical,genres_Mystery,genres_Romance,genres_Sci-Fi,genres_Thriller,genres_War,genres_Western,average_rating,times_rated,year
0,1,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,3.920930,215.0,1995
1,3,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,3.259615,52.0,1995
2,6,1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,3.946078,102.0,1995
3,47,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,3.975369,203.0,1995
4,50,0,0,0,0,0,1,0,0,0,...,0,1,0,0,1,0,0,4.237745,204.0,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,166534,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,3.333333,6.0,2017
100832,168248,1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,4.142857,7.0,2017
100833,168250,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3.633333,15.0,2017
100834,168252,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,4.280000,25.0,2017


## Training the data

In [11]:
u_s = 3 # start of the user set to use on training
i_s = 1 # start of the item set to use on training
X_u = user_df.values
X_i = movies_df.values
y = user_df['rating'].values

In [12]:
print(f'shape of X_u is {X_u.shape}')
print(f'shape of X_i is {X_i.shape}')
print(f'shape of y is {y.shape}')

shape of X_u is (100836, 21)
shape of X_i is (100836, 23)
shape of y is (100836,)


In [13]:
X_u

array([[1.00000e+00, 1.00000e+00, 4.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [1.00000e+00, 3.00000e+00, 4.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [1.00000e+00, 6.00000e+00, 4.00000e+00, ..., 0.00000e+00,
        4.00000e+00, 0.00000e+00],
       ...,
       [6.10000e+02, 1.68250e+05, 5.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [6.10000e+02, 1.68252e+05, 5.00000e+00, ..., 5.00000e+00,
        0.00000e+00, 0.00000e+00],
       [6.10000e+02, 1.70875e+05, 3.00000e+00, ..., 0.00000e+00,
        3.00000e+00, 0.00000e+00]])

In [14]:
X_u_scalar = StandardScaler()
X_u_scalar = X_u_scalar.fit(X_u)
X_u_norm = X_u_scalar.transform(X_u)

X_i_scalar = StandardScaler()
X_i_scalar = X_i_scalar.fit(X_i)
X_i_norm = X_i_scalar.transform(X_i)

y_min_max = MinMaxScaler((-1, 1))
y_min_max = y_min_max.fit(y.reshape(-1, 1))
y_norm = y_min_max.transform(y.reshape(-1, 1))

In [15]:
user_train, user_test = train_test_split(X_u_norm, train_size=0.8, shuffle=True, random_state=1)
item_train, item_test = train_test_split(X_i_norm, train_size=0.8, shuffle=True, random_state=1)
y_train, y_test = train_test_split(y_norm, train_size=0.8, shuffle=True, random_state=1)

print(f'shape of user_train is {user_train.shape} and shape of user_test is {user_test.shape}')
print(f'shape of item_train is {item_train.shape} and shape of item_test is {item_test.shape}')
print(f'shape of y_train is {y_train.shape} and shape of y_test is {y_test.shape}')

shape of user_train is (80668, 21) and shape of user_test is (20168, 21)
shape of item_train is (80668, 23) and shape of item_test is (20168, 23)
shape of y_train is (80668, 1) and shape of y_test is (20168, 1)


In [16]:
number_of_outputs = 32
num_user_features = user_train[:, u_s:].shape[1]
num_item_features = item_train[:, i_s:].shape[1]

user_NN = Sequential([
    Dense(units=256, activation='relu'),
    Dense(units=128, activation='relu'),
    Dense(units=number_of_outputs)
])

movie_NN = Sequential([
    Dense(units=256, activation='relu'),
    Dense(units=128, activation='relu'),
    Dense(units=number_of_outputs)
])

user_input = tf.keras.Input(shape=(num_user_features,))
vu = user_NN(user_input)
vu = tf.linalg.l2_normalize(vu, axis=1)

item_input = tf.keras.Input(shape=(num_item_features,))
vm = movie_NN(item_input)
vm = tf.linalg.l2_normalize(vm, axis=1)

output = Dot(axes=1)([vu, vm])

model = tf.keras.Model([user_input, item_input], output)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 18)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 22)]         0           []                               
                                                                                                  
 sequential (Sequential)        (None, 32)           41888       ['input_1[0][0]']                
                                                                                                  
 sequential_1 (Sequential)      (None, 32)           42912       ['input_2[0][0]']                
                                                                                              

In [17]:
model.compile(
    loss = MeanSquaredError(),
    optimizer = tf.keras.optimizers.Adam(0.01)
)

model.fit([user_train[:, u_s:], item_train[:, i_s:]], y_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x23b7418bd30>

In [18]:
new_userId = 123456
new_movieId = 111111
rating = 0
genres_Action = 5
genres_Adventure = 0 
genres_Animation = 3.5
genres_Children = 0
genres_Comedy = 0
genres_Crime = 4.5
genres_Documentary = 3.5
genres_Drama = 4
genres_Fantasy = 0
genres_Film_Noir = 0
genres_Horror = 0
genres_IMAX = 0
genres_Musical = 0 
genres_Mystery = 0
genres_Romance = 4
genres_SciFi = 0
genres_Thriller = 0
genres_War = 0

new_user_vec = np.array([[new_userId, new_movieId, rating, genres_Action, genres_Adventure, genres_Animation, 
                          genres_Children, genres_Comedy, genres_Crime, genres_Documentary, genres_Drama, 
                          genres_Fantasy, genres_Film_Noir, genres_Horror, genres_IMAX, genres_Musical, 
                          genres_Mystery, genres_Romance, genres_SciFi, genres_Thriller, genres_War]])

new_user_vec.shape

(1, 21)

In [19]:
new_user_vec_norm = X_u_scalar.transform(new_user_vec)

length = item_test.shape[0]
user_vec_norm = np.array(new_user_vec_norm)
for i in range(length - 1):
    user_vec_norm = np.concatenate([user_vec_norm, new_user_vec_norm])

In [20]:
y_hat = model.predict([user_vec_norm[:, u_s:], item_test[:, i_s:]])



In [21]:
y_hat = y_min_max.inverse_transform(y_hat)

In [22]:
selected_movie = np.argwhere(y_hat.flatten() >= 4.6)

In [23]:
movieIds = X_i_scalar.inverse_transform(item_test)[selected_movie]
movieIds = movieIds[:, :, 0]
movieIds = movieIds.flatten()

In [24]:
suggests = movies_df[movies_df['movieId'].isin(movieIds)].set_index('movieId')
suggests = suggests[~suggests.index.duplicated(keep='first')]
movieIds = suggests.index

In [25]:
titles_suggest = pd.DataFrame(titles.loc[movieIds])
titles_suggest = titles_suggest[~titles_suggest.index.duplicated(keep='first')]
titles_suggest

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
1,Toy Story (1995)
6,Heat (1995)
47,Seven (a.k.a. Se7en) (1995)
50,"Usual Suspects, The (1995)"
70,From Dusk Till Dawn (1996)
...,...
113862,"Guest, The (2014)"
117867,'71 (2014)
128838,Crimson Peak (2015)
130052,Clown (2014)
