In [249]:
# lib
import numpy as np
import pandas as pd

from surprise import Dataset, Reader
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise.prediction_algorithms.knns import KNNBasic

In [250]:
# load dataset
movies_metadata = pd.read_csv('data/Movie/movies_metadata.csv')
ratings = pd.read_csv('data/Movie/ratings_small.csv')

  movies_metadata = pd.read_csv('data/Movie/movies_metadata.csv')


In [251]:
# Data Preprocessing and Cleaning
movies_metadata = movies_metadata[movies_metadata['id'].str.isnumeric()]
movies_metadata = movies_metadata.drop_duplicates()
movies_metadata['id'] = pd.to_numeric(movies_metadata['id'], errors='coerce')

ratings['movieId'] = pd.to_numeric(ratings['movieId'], errors='coerce')

# Drop NaN values
movies_metadata.dropna(subset=['id'], inplace=True)
ratings.dropna(subset=['movieId'], inplace=True)

# Check
print(movies_metadata.shape, ratings.shape)

(45450, 24) (100004, 4)


In [252]:
# To ensure the quality of recommendation
# we select movies with votes more than 50
movie_md = movies_metadata[movies_metadata['vote_count'] > 50][['id', 'title']]

# ID
movie_ids = [int(x) for x in movie_md['id'].values]

# Select ratings of movies with more than 50 counts
ratings = ratings[ratings['movieId'].isin(movie_ids)]

# Reset Index
ratings.reset_index(inplace=True, drop=True)

# Check
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1371,2.5,1260759135
1,1,2105,4.0,1260759139
2,1,2294,2.0,1260759108
3,2,17,5.0,835355681
4,2,62,3.0,835355749


In [253]:
movie_md.head(5)

Unnamed: 0,id,title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
4,11862,Father of the Bride Part II
5,949,Heat


In [254]:
# Initialize a surprise reader object
reader = Reader(line_format='user item rating', sep=',', rating_scale=(0,5), skip_lines=1)

# Load the data
data = Dataset.load_from_df(ratings[['userId','movieId','rating']], reader=reader)

# Split the dataset into the train and test set
trainset, testset = train_test_split(data, test_size=0.2)

In [255]:
# Initialize model
svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2152a1c6470>

In [256]:
def get_recommendations(data, movie_md, user_id, top_n, algo):
    recommendations = []
    
    # creating an user-item interactions matrix
    user_movie_interactions_matrix = data.pivot(index='userId', columns='movieId', values='rating')
    
    # extracting product ids which the user_id has not interacted yet
    non_interacted_movies = user_movie_interactions_matrix.loc[user_id][user_movie_interactions_matrix.loc[user_id].isnull()].index.tolist()
    
    # looping through each of the product ids which user_id has not interacted yet
    for item_id in non_interacted_movies:
        # predicting the ratings for those non interacted product ids by this user
        est = algo.predict(user_id, item_id).est
        
        # appending the predicted ratings
        # movie_name = movies_metadata[movies_metadata['movieId']==str(item_id)]['title']
        movie_info = movie_md[movie_md['id'] == item_id]['title']
        movie_name = movie_info.iloc[0] if not movie_info.empty else 'Unknown Movie'
        recommendations.append((movie_name, est))

    # sorting the predicted ratings in descending order
    recommendations.sort(key=lambda x: x[1], reverse=True)

    return recommendations[:top_n] # returing top n highest predicted rating products for this user

In [257]:
get_recommendations(data=ratings, movie_md=movies_metadata, user_id=5, top_n=10, algo=svd)

[('Madagascar', 4.589087958148596),
 ('Galaxy Quest', 4.552346446780332),
 ('Birdman of Alcatraz', 4.548389244120531),
 ('Mission: Impossible', 4.524467166267829),
 ('Terminator 3: Rise of the Machines', 4.5139128395751715),
 ('Laura', 4.484267840947472),
 ('Men in Black II', 4.480676826628095),
 ('Terminator Salvation', 4.476362915175064),
 ('Irma la Douce', 4.475360866724564),
 ('Once Were Warriors', 4.472349017507658)]

### User-Based

In [258]:
sim_options = {'name': 'cosine',
               'user_based': True}

# KNN algorithm to find similar items
sim_user = KNNBasic(sim_options=sim_options, verbose=False, random_state=50)

# Train on the trainset, and predict ratings for the testset
sim_user.fit(trainset)

<surprise.prediction_algorithms.knns.KNNBasic at 0x2152a1c7490>

In [259]:
get_recommendations(ratings, movie_md, 10, 10, sim_user)

[('Rio Bravo', 5),
 ('The Celebration', 5),
 ('Grease', 5),
 ('A Streetcar Named Desire', 5),
 ('The Evil Dead', 5),
 ('Strangers on a Train', 5),
 ("Singin' in the Rain", 5),
 ('The General', 5),
 ('Once Upon a Time in Mexico', 5),
 ('Captain America: The First Avenger', 5)]

In [260]:
predictions_user_based = sim_user.test(testset)
rmse_user_based = accuracy.rmse(predictions_user_based)

RMSE: 0.9673


### Item-Based

In [261]:
sim_options = {'name': 'cosine',
               'user_based': False}

# KNN algorithm to find similar items
sim_item = KNNBasic(sim_options=sim_options, verbose=False, random_state=50)

# Train on the trainset, and predict ratings for the testset
sim_item.fit(trainset)

<surprise.prediction_algorithms.knns.KNNBasic at 0x2152a1c6260>

In [262]:
get_recommendations(ratings, movie_md, 10, 10, sim_item)

[('A History of Violence', 5),
 ('Star Trek: Insurrection', 5),
 ('Scoop', 5),
 ('The Running Man', 5),
 ('Fear and Loathing in Las Vegas', 5),
 ('In Good Company', 5),
 ('Identity', 5),
 ('The Man Who Knew Too Much', 5),
 ('American Wedding', 5),
 ('Taking Woodstock', 5)]

In [263]:
predictions_item_based = sim_item.test(testset)
rmse_item_based = accuracy.rmse(predictions_item_based)

RMSE: 0.9906
