In [1]:
# Import Dependencies
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.neighbors import NearestNeighbors
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import requests

In [2]:
# API url 
url="http://127.0.0.1:5000/api/v1.0/movies_list"

In [3]:
# Call the api and retrieve data
response=requests.get(url)

In [4]:
# Checking if the request was successful
if response.status_code == 200:
    # Convert the JSON response to a pandas DataFrame
    movies_combined_df = pd.DataFrame(response.json())
else:
    print("Failed to retrieve data:", response.status_code)

In [5]:
# Checking results of data requeset
movies_combined_df.head()

Unnamed: 0,age,age_desc,cleaned_genres,gender,movieId,occ_desc,occupation,poster_path,rating,timestamp,title,userId,zipcode
0,18,18-24,Action|Crime|Drama|Thriller,F,949,college/grad student,4,/zMyfPUelumio3tiDKPffaUpsQTD.jpg,5.0,956688825,Heat,242,53706
1,18,18-24,Action|Crime|Drama|Thriller,M,949,college/grad student,4,/zMyfPUelumio3tiDKPffaUpsQTD.jpg,3.0,898008327,Heat,311,31201
2,25,25-34,Action|Crime|Drama|Thriller,M,949,writer,20,/zMyfPUelumio3tiDKPffaUpsQTD.jpg,2.0,955092697,Heat,232,55408
3,35,35-44,Action|Crime|Drama|Thriller,F,949,executive/managerial,7,/zMyfPUelumio3tiDKPffaUpsQTD.jpg,5.0,974670478,Heat,387,55111
4,35,35-44,Action|Crime|Drama|Thriller,M,949,technician/engineer,17,/zMyfPUelumio3tiDKPffaUpsQTD.jpg,3.5,1340405089,Heat,505,37815


In [6]:
# Check the number of ratings, unique movieId's, unique users, and average ratings per user and movie.
n_ratings = len(movies_combined_df)
n_movies = len(movies_combined_df['movieId'].unique())
n_users = len(movies_combined_df['userId'].unique())
 
print(f"Number of ratings: {n_ratings}")
print(f"Number of unique movieId's: {n_movies}")
print(f"Number of unique users: {n_users}")
print(f"Average ratings per user: {round(n_ratings/n_users, 2)}")
print(f"Average ratings per movie: {round(n_ratings/n_movies, 2)}")

Number of ratings: 43000
Number of unique movieId's: 2615
Number of unique users: 671
Average ratings per user: 64.08
Average ratings per movie: 16.44


In [7]:
# Select columns and drop duplicates
movies_df = movies_combined_df[['movieId', 'title', 'cleaned_genres']].copy()
movies_df = movies_df.drop_duplicates()
movies_df.head()

Unnamed: 0,movieId,title,cleaned_genres
0,949,Heat,Action|Crime|Drama|Thriller
13,710,GoldenEye,Adventure|Action|Thriller
15,1408,Cutthroat Island,Action|Adventure
56,524,Casino,Drama|Crime
96,4584,Sense and Sensibility,Drama|Romance


In [8]:
# Check the number of ratings and unique movieId's
n_ratings = len(movies_df)
n_movies = len(movies_df['movieId'].unique())

print(f"Number of ratings: {n_ratings}")
print(f"Number of unique movieId's: {n_movies}")


Number of ratings: 2615
Number of unique movieId's: 2615


In [9]:
# Selcet columns
ratings_df = movies_combined_df[['userId', 'movieId', 'rating']].copy()
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,242,949,5.0
1,311,949,3.0
2,232,949,2.0
3,387,949,5.0
4,505,949,3.5


In [10]:
# Check the number of ratings, unique movieId's, and unique users
n_ratings = len(ratings_df)
n_movies = len(ratings_df['movieId'].unique())
n_users = len(ratings_df['userId'].unique())
 
print(f"Number of ratings: {n_ratings}")
print(f"Number of unique movieId's: {n_movies}")
print(f"Number of unique users: {n_users}")


Number of ratings: 43000
Number of unique movieId's: 2615
Number of unique users: 671


In [11]:
# Select columns and drop duplicates
users_df = movies_combined_df[['userId', 'gender', 'zipcode', 'age_desc', 'occ_desc']] .copy()
users_df = users_df.drop_duplicates()
users_df.head()

Unnamed: 0,userId,gender,zipcode,age_desc,occ_desc
0,242,F,53706,18-24,college/grad student
1,311,M,31201,18-24,college/grad student
2,232,M,55408,25-34,writer
3,387,F,55111,35-44,executive/managerial
4,505,M,37815,35-44,technician/engineer


In [12]:
# Check the number of ratings and unique users
n_ratings = len(users_df)
n_users = len(users_df['userId'].unique())
 
print(f"Number of ratings: {n_ratings}")
print(f"Number of unique users: {n_users}")


Number of ratings: 671
Number of unique users: 671


In [13]:
# Training set randomized options
import random
import math
RNG_SEED = 142
random_ratings = ratings_df.sample(frac=1, random_state=RNG_SEED)

# Randomize the dataframes
users = random_ratings['userId'].values
movies = random_ratings['movieId'].values
ratings = random_ratings['rating'].values

print(f"users:", users, ', shape =', users.shape)
print(f"movies:", movies, ', shape =', movies.shape)
print(f"ratings:", ratings, ', shape =', ratings.shape)

users: [245  48 502 ... 475 468 242] , shape = (43000,)
movies: [ 4857 33166   605 ... 41566 52767   300] , shape = (43000,)
ratings: [2.  3.5 3.  ... 3.  4.5 4. ] , shape = (43000,)


In [14]:
import tensorflow as tf
class CFModel(tf.keras.Model):
    def __init__(self, n_users, m_items, k_factors):
        super(CFModel, self).__init__()
        
        self.P = tf.keras.Sequential([
            tf.keras.layers.Embedding(n_users, k_factors, input_length=1),
            tf.keras.layers.Reshape((k_factors,))
        ])
        
        self.Q = tf.keras.Sequential([
            tf.keras.layers.Embedding(m_items, k_factors, input_length=1),
            tf.keras.layers.Reshape((k_factors,))
        ])
        
    def call(self, inputs):
        user_id, item_id = inputs
        user_latent = self.P(user_id)
        item_latent = self.Q(item_id)
        return tf.reduce_sum(tf.multiply(user_latent, item_latent), axis=1)
    
    def rate(self, user_id, item_id):
        user_embedding = self.P(tf.constant([user_id]))
        item_embedding = self.Q(tf.constant([item_id]))
        prediction = tf.reduce_sum(tf.multiply(user_embedding, item_embedding), axis=1)[0]
        return prediction.numpy()  


In [15]:
# Capture the max userId and movieId
user_id_max = ratings_df['userId'].drop_duplicates().max()
movie_id_max = ratings_df['movieId'].drop_duplicates().max()

In [16]:
# Ensure user_id and movie_id fall within range
n_users = user_id_max + 1
m_items = movie_id_max + 1

In [17]:
# Test with constant 
FACTORS = 100

In [18]:
# Colabritive filtering model
cf_model = CFModel(n_users, m_items, FACTORS)

# Compile, loss: Mean Squared Error, opimizer: Adamax
cf_model.compile(loss='mse', optimizer='adamax')

In [19]:
# Ensure compatibility with model
print("Max User ID:", user_id_max)
print("Max Movie ID:", movie_id_max)
print("Number of users (n_users):", n_users)
print("Number of items (m_items):", m_items)

Max User ID: 671
Max Movie ID: 160718
Number of users (n_users): 672
Number of items (m_items): 160719


In [20]:
# Train the model
# Set callbacks to monitor validation loss and save the best model weights
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2),
]

# Use epochs for training
epochs = 60

# Fit the model with callbacks
results = cf_model.fit(
    x=[users, movies],
    y=ratings,
    epochs=epochs,
    validation_split=.1,
    verbose=2,
    callbacks=callbacks
)

# Save the entire model using the TensorFlow SavedModel format
tf.keras.models.save_model(cf_model, 'saved_model')


Epoch 1/60
1210/1210 - 1s - loss: 13.7575 - val_loss: 13.7273 - 1s/epoch - 883us/step
Epoch 2/60
1210/1210 - 1s - loss: 13.7209 - val_loss: 13.6844 - 644ms/epoch - 533us/step
Epoch 3/60
1210/1210 - 1s - loss: 13.5917 - val_loss: 13.4797 - 668ms/epoch - 552us/step
Epoch 4/60
1210/1210 - 1s - loss: 13.2007 - val_loss: 12.9203 - 707ms/epoch - 585us/step
Epoch 5/60
1210/1210 - 1s - loss: 12.3765 - val_loss: 11.8845 - 628ms/epoch - 519us/step
Epoch 6/60
1210/1210 - 1s - loss: 11.0925 - val_loss: 10.4549 - 655ms/epoch - 541us/step
Epoch 7/60
1210/1210 - 1s - loss: 9.5504 - val_loss: 8.9375 - 686ms/epoch - 567us/step
Epoch 8/60
1210/1210 - 1s - loss: 8.1254 - val_loss: 7.6847 - 652ms/epoch - 539us/step
Epoch 9/60
1210/1210 - 1s - loss: 6.9997 - val_loss: 6.7109 - 634ms/epoch - 524us/step
Epoch 10/60
1210/1210 - 1s - loss: 6.1138 - val_loss: 5.9393 - 635ms/epoch - 524us/step
Epoch 11/60
1210/1210 - 1s - loss: 5.3965 - val_loss: 5.3128 - 731ms/epoch - 604us/step
Epoch 12/60
1210/1210 - 1s - los

INFO:tensorflow:Assets written to: saved_model/assets


In [21]:
# Show the best validation RMSE
val_losses = results.history['val_loss']
min_val_loss = min(val_losses)
idx = val_losses.index(min_val_loss) + 1  # Add 1 to get epoch number starting from 1
print('Minimum RMSE at epoch', idx, '=', '{:.4f}'.format(math.sqrt(min_val_loss)))

Minimum RMSE at epoch 60 = 1.1242


In [39]:
# Select user
USER = 660

In [40]:
users_df[users_df['userId'] == USER]

Unnamed: 0,userId,gender,zipcode,age_desc,occ_desc
3511,660,M,70507,45-49,self-employed


In [41]:
# Define predicted rating
def predict_rating(userId, movieId):
    return cf_model.rate(userId - 1, movieId - 1)

In [42]:
# Get the top rated movies by current user
user_ratings = ratings_df[ratings_df["userId"] == USER][['userId', 'movieId', 'rating']]
user_ratings['prediction'] = user_ratings.apply(lambda x: predict_rating(USER, x['movieId']), axis=1)

# Remove duplicate movie entries from movies_df DataFrame
unique_movies_df = movies_df.drop_duplicates(subset=['movieId'])

# Merge the user_ratings DataFrame with the unique_movies_df DataFrame to get the top 10 movies by current user.
merged_data = user_ratings.merge(unique_movies_df, on='movieId', how='inner')

# Sort the DataFrame by rating in descending order and reset the index.
top_10_rated_movies = merged_data.sort_values(by='rating', ascending=False)
top_10_rated_movies = top_10_rated_movies.reset_index(drop=True)

# Display the top 10 rated movies by current user.
top_10_rated_movies.head(10)


Unnamed: 0,userId,movieId,rating,prediction,title,cleaned_genres
0,660,260,4.5,2.490349,The 39 Steps,Action|Thriller|Mystery
1,660,40815,4.5,-0.043317,On Guard,Drama|Adventure
2,660,81847,4.5,0.05115,The Dawn Patrol,Action|War|Drama
3,660,111759,4.5,0.096745,Don Q Son of Zorro,Western|Adventure|Romance
4,660,54001,4.5,0.00689,The Traveler,Drama
5,660,8970,4.0,0.170644,The Out-of-Towners,Comedy
6,660,4993,4.0,2.362629,5 Card Stud,Action|Western|Thriller
7,660,8961,4.0,-0.062767,Bad Boys II,Adventure|Action|Comedy|Thriller|Crime
8,660,30707,4.0,-0.068407,Star 80,Drama
9,660,77561,4.0,0.135101,EVA,Science Fiction


In [43]:
# Get unrated movies for the user
recommendations = ratings_df[ratings_df['movieId'].isin(user_ratings['movieId'])== False][['movieId']].drop_duplicates()

# Generate predictions for unrated movies
recommendations['prediction'] = recommendations.apply(lambda x: predict_rating(USER, x['movieId']), axis=1)

# Merge predictions with movie information
recommended_movies = recommendations.sort_values(by='prediction', ascending=False).merge(movies_df,
                                                                                         on='movieId',
                                                                                         how='inner',
                                                                                         suffixes=['_u', '_m'])

# Filter out duplicate titles for one specific user
recommended_movies = recommended_movies.drop_duplicates(subset=['title'])

# Reset index
top_10_recommended_movies = recommended_movies.reset_index(drop=True)

# Display the top 20 recommended movies
top_10_recommended_movies.head(10)

Unnamed: 0,movieId,prediction,title,cleaned_genres
0,609,4.382645,Poltergeist,Horror
1,319,4.292805,True Romance,Action|Thriller|Crime|Romance
2,476,4.139603,Drugstore Cowboy,Crime|Drama
3,594,4.126383,The Terminal,Comedy|Drama
4,746,4.088197,The Last Emperor,Drama|History
5,1255,4.041733,The Host,Horror|Drama|Science Fiction
6,927,4.025468,Gremlins,Fantasy|Horror|Comedy
7,899,4.022752,Broken Blossoms,Drama|Romance
8,308,4.008721,Broken Flowers,Comedy|Drama|Mystery|Romance
9,195,4.007994,Trouble in Paradise,Comedy|Romance
