# Movies Recommendation Using Content Based Filtering
# Outline
- [ 1- Packages](#1--importing-required-packages)
- [ 2- Datasets](#2-importing-datasets)
- [ 3- Preparing the data](#3--data-preperation)
- [ 4- Neural Network](#4-neural-network-for-content-based-filtering)
- [ Saving the model](#5--saving-trained-model)

# 1- Importing required packages.

In [124]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# 2-Importing datasets
The data set is processed from the [MovieLens ml-latest-small](https://grouplens.org/datasets/movielens/latest/) dataset.
The **movie dataset** provided to the network is a combination of the original data and some 'engineered features'. The original features are the year the movie was released and the movie's genre's presented as a one-hot vector. There are 14 genres. The engineered feature is an average rating derived from the user ratings.

The **user content** is composed of engineered features. A per genre average rating is computed per user. Additionally, a user id, rating count and rating average are available but not included in the training or prediction content. They are carried with the data set because they are useful in interpreting data.
The **y** dataset contains movies ratings.

In [140]:
user_df=pd.read_csv('../input/users.csv')
movie_df=pd.read_csv('../input/movies.csv')
y=pd.read_csv('../input/ratings.csv')

num_user_features = user_df.shape[1] - 3  # remove userid, rating count and ave rating during training
num_item_features = movie_df.shape[1] - 1  # remove movie id at train time
uvs = 3  # user genre vector start
ivs = 3  # item genre vector start
u_s = 3  # start of columns to use in training, user
i_s = 1  # start of columns to use in training, items

user_df.head()

Unnamed: 0,user id,rating count,rating ave,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,2.0,22.0,4.0,3.95,4.25,0.0,0.0,4.0,4.12,4.0,4.04,0.0,3.0,4.0,0.0,3.88,3.89
1,2.0,22.0,4.0,3.95,4.25,0.0,0.0,4.0,4.12,4.0,4.04,0.0,3.0,4.0,0.0,3.88,3.89
2,2.0,22.0,4.0,3.95,4.25,0.0,0.0,4.0,4.12,4.0,4.04,0.0,3.0,4.0,0.0,3.88,3.89
3,2.0,22.0,4.0,3.95,4.25,0.0,0.0,4.0,4.12,4.0,4.04,0.0,3.0,4.0,0.0,3.88,3.89
4,2.0,22.0,4.0,3.95,4.25,0.0,0.0,4.0,4.12,4.0,4.04,0.0,3.0,4.0,0.0,3.88,3.89


In [126]:
movie_df.head()

Unnamed: 0,movie id,year,ave rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,6874,2003,3.961832,1,0,0,0,0,1,0,0,0,0,0,0,0,1
1,8798,2004,3.761364,1,0,0,0,0,1,0,1,0,0,0,0,0,1
2,46970,2006,3.25,1,0,0,0,1,0,0,0,0,0,0,0,0,0
3,48516,2006,4.252336,0,0,0,0,0,1,0,1,0,0,0,0,0,1
4,58559,2008,4.238255,1,0,0,0,0,1,0,1,0,0,0,0,0,0


In [127]:
y.head()

Unnamed: 0,rating
0,4.0
1,3.5
2,4.0
3,4.0
4,4.5


# 3- Data Preperation
Scalling the numerical features using StandardScaler.


In [141]:
user_df_unscaled=user_df.copy()
movie_df_unscaled=movie_df.copy()
y_unscaled=y.copy()
user_df=np.array(user_df)
movie_df=np.array(movie_df)
y=np.array(y)

user_scaler=StandardScaler()
movie_scaler=StandardScaler()
y_scaler=MinMaxScaler((-1,1))

user_scaler.fit(user_df)
movie_scaler.fit(movie_df)
y_scaler.fit(np.array(y).reshape(-1,1))

user_df=pd.DataFrame(user_scaler.transform(user_df),columns=user_df_unscaled.columns)
movie_df=pd.DataFrame(movie_scaler.transform(movie_df),columns=movie_df_unscaled.columns)
y=y_scaler.transform(y.reshape(-1,1))



Splitting the datasets into training and testing data.

In [142]:
user_train,user_test=train_test_split(user_df,test_size=0.3,shuffle=True,random_state=123)
movie_train,movie_test=train_test_split(movie_df,test_size=0.3,shuffle=True,random_state=123)
y_train,y_test=train_test_split(y,test_size=0.3,shuffle=True,random_state=123)


# 4-Neural Network for Content Based Filtering

In [143]:
# GRADED_CELL
# UNQ_C1

num_outputs = 32
tf.random.set_seed(1)
user_NN = tf.keras.models.Sequential([
    ### START CODE HERE ###
keras.layers.Dense(units=256,activation='relu'),
  keras.layers.Dense(units=128,activation='relu'),
  keras.layers.Dense(units=num_outputs,activation='linear')

    ### END CODE HERE ###
])

item_NN = tf.keras.models.Sequential([
    ### START CODE HERE ###
  keras.layers.Dense(units=256,activation='relu'),
  keras.layers.Dense(units=128,activation='relu'),
  keras.layers.Dense(units=num_outputs,activation='linear')
    ### END CODE HERE ###
])

# create the user input and point to the base network
input_user = keras.layers.Input(shape=(num_user_features))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)

# create the item input and point to the base network
input_item = keras.layers.Input(shape=(num_item_features))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

# compute the dot product of the two vectors vu and vm
output = keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model
model = keras.Model([input_user, input_item], output)

model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_10 (InputLayer)          [(None, 14)]         0           []                               
                                                                                                  
 input_11 (InputLayer)          [(None, 16)]         0           []                               
                                                                                                  
 sequential_14 (Sequential)     (None, 32)           40864       ['input_10[0][0]']               
                                                                                                  
 sequential_15 (Sequential)     (None, 32)           41376       ['input_11[0][0]']               
                                                                                            

In [144]:
tf.random.set_seed(1)
cost_fn = keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt,
              loss=cost_fn)

In [145]:
tf.random.set_seed(1)
model.fit([user_train[:, u_s:], movie_train[:, i_s:]], y_train, epochs=30)

InvalidIndexError: (slice(None, None, None), slice(3, None, None))

In [134]:
model.evaluate([user_test[:, u_s:], movie_test[:, i_s:]], y_test)



0.08295433223247528

# 5- Saving trained models

In [136]:
model.save('../models/my_model')

INFO:tensorflow:Assets written to: ../models/my_model\assets


In [146]:
from joblib import dump, load
dump(user_scaler,'../models/userScaler.bin',compress=True)
dump(movie_scaler,'../models/movieScaler.bin',compress=True)
dump(y_scaler,'../models/targetScaler.bin',compress=True)

['../models/targetScaler.bin']

In [None]:
new_user_id = 5000
new_rating_ave = 0.0
new_action = 0.0
new_adventure = 5.0
new_animation = 0.0
new_childrens = 0.0
new_comedy = 0.0
new_crime = 0.0
new_documentary = 0.0
new_drama = 0.0
new_fantasy = 5.0
new_horror = 0.0
new_mystery = 0.0
new_romance = 0.0
new_scifi = 0.0
new_thriller = 0.0
new_rating_count = 3

user_vec = np.array([[new_user_id, new_rating_count, new_rating_ave,
                      new_action, new_adventure, new_animation, new_childrens,
                      new_comedy, new_crime, new_documentary,
                      new_drama, new_fantasy, new_horror, new_mystery,
                      new_romance, new_scifi, new_thriller]])

In [None]:
user_vec=pd.DataFrame(user_vec)
user_vec=user_vec.loc[user_vec.index.repeat(5)]

In [None]:
user_vec.head()

In [None]:
x=np.array([[1,23,4]])
print(np.repeat(x,3,axis=0))