This model combines the recommendations generated by the content-based, collaborative-filtering, and SVD models. The hybrid model overcomes the shortcomings of the individual models and improves the diversity of the recommendations.

In [1]:
!pip install surprise

[33mDEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [2]:
import pandas as pd
import numpy as np

In [3]:
from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import dump

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [5]:
def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):
    """Build a Surprise trainset and testset from two ratings dataframes.

    Both frames must provide 'userId', 'movieId' and 'rating' columns; any
    other columns are ignored. Ratings are read with a (0, 5) rating scale.

    Returns:
        (trainset, testset): the result of ``construct_trainset`` on the
        training ratings and of ``construct_testset`` on the testing ratings.
    """
    rating_columns = ['userId', 'movieId', 'rating']
    reader = Reader(rating_scale=(0, 5))
    train_data = Dataset.load_from_df(training_dataframe[rating_columns], reader)
    test_data = Dataset.load_from_df(testing_dataframe[rating_columns], reader)
    return (
        train_data.construct_trainset(train_data.raw_ratings),
        test_data.construct_testset(test_data.raw_ratings),
    )

In [6]:
# Pre-split ratings files; both must provide the userId, movieId and rating
# columns that convert_traintest_dataframe_forsurprise selects.
file_path_train = 'training_data.csv'
file_path_test = 'testing_data.csv'
traindf = pd.read_csv(file_path_train)
testdf = pd.read_csv(file_path_test)
# Surprise-side objects used to fit and test the algorithms below.
trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)

In [7]:
testdf.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
0,1,1,4.0,964982703,"['Adventure', 'Animation', 'Children', 'Comedy...",[]
1,1,3,4.0,964981247,"['Comedy', 'Romance']",[]
2,1,6,4.0,964982224,"['Action', 'Crime', 'Thriller']",[]
3,1,163,5.0,964983650,"['Action', 'Romance', 'Western']",[]
4,1,316,3.0,964982310,"['Action', 'Adventure', 'Sci-Fi']",[]


### CF and Latent Factor models:

In [8]:
# Basic collaborative filtering algorithm taking into account a baseline rating.
# Similarities are computed between items (user_based=False) with the cosine measure.
sim_options = {'name': 'cosine',
               'user_based': False  # compute  similarities between items
               }
knnbaseline_algo = KNNBaseline(sim_options=sim_options)

knnbaseline_algo.fit(trainset)
knnbaseline_predictions = knnbaseline_algo.test(testset)

file_name = 'KnnBaseline_model'
# Store the fitted model in the `algo` slot and the test predictions in the
# `predictions` slot. Passing the predictions as `algo` (as before) meant the
# model itself was never persisted and could not be reloaded.
dump.dump(file_name, predictions=knnbaseline_predictions, algo=knnbaseline_algo)
# _, loaded_algo = dump.load(file_name)

# Error metrics on the test-set predictions.
accuracy.rmse(knnbaseline_predictions)
accuracy.mae(knnbaseline_predictions)
print("Done!")

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8962
MAE:  0.6935
Done!


In [9]:
# SVD with Surprise's default hyperparameters (none are overridden here).
svd_algo = SVD()

svd_algo.fit(trainset)
svd_predictions = svd_algo.test(testset)

# Persist the fitted model so it can be reloaded with dump.load.
file_name = 'svd_model'
dump.dump(file_name, algo=svd_algo)
# _, loaded_algo = dump.load(file_name)

# Error metrics on the test-set predictions.
accuracy.rmse(svd_predictions)
accuracy.mae(svd_predictions)
print("Done!")

RMSE: 0.8755
MAE:  0.6728
Done!


In [10]:
# SVD++ with Surprise's default hyperparameters (none are overridden here).
svdpp_algo = SVDpp()

svdpp_algo.fit(trainset)
svdpp_predictions = svdpp_algo.test(testset)

# Use a file name of its own: dumping to 'svd_model' here would overwrite the
# plain SVD model saved under that name earlier in the notebook.
file_name = 'svdpp_model'
dump.dump(file_name, algo=svdpp_algo)
# _, loaded_algo = dump.load(file_name)

# Error metrics on the test-set predictions.
accuracy.rmse(svdpp_predictions)
accuracy.mae(svdpp_predictions)
print("Done!")

RMSE: 0.8652
MAE:  0.6640
Done!


##### Movie Similarity model

In [39]:
movies = pd.read_csv("tmdb_movies.csv")

In [40]:
# Genre name -> integer index; the order of the list defines the index.
genre_to_idx = {
    genre: position
    for position, genre in enumerate([
        'Adventure',
        'Animation',
        'Children',
        'Comedy',
        'Fantasy',
        'Romance',
        'Drama',
        'Action',
        'Crime',
        'Thriller',
        'Horror',
        'Mystery',
        'Sci-Fi',
        'War',
        'Musical',
        'Documentary',
        'IMAX',
        'Western',
        'Film-Noir',
        '(no genres listed)',
    ])
}

In [41]:
# Integer index -> genre name; user_top_genre uses it to name the positions of
# a user's genre vector.
idx_to_genre = dict(enumerate([
    'Adventure',
    'Animation',
    'Children',
    'Comedy',
    'Fantasy',
    'Romance',
    'Drama',
    'Action',
    'Crime',
    'Thriller',
    'Horror',
    'Mystery',
    'Sci-Fi',
    'War',
    'Musical',
    'Documentary',
    'IMAX',
    'Western',
    'Film-Noir',
    '(no genres listed)',
]))

In [42]:
# Missing text must become '' *before* concatenating: NaN + str is NaN, so a
# movie without an overview would otherwise lose its tagline and genres too.
movies['tagline'] = movies['tagline'].fillna('')
_overview_text = movies['overview'].fillna('')
_genres_text = movies['genres'].fillna('')
# Join the fields with spaces so the last word of one field cannot fuse with
# the first word of the next. Genres are still included twice, as before.
movies['description_genre'] = (
    _overview_text + ' ' + movies['tagline'] + ' ' + _genres_text + ' ' + _genres_text
)

In [43]:
# Unigram + bigram TF-IDF over the combined description text, English stop words removed.
# min_df=1 keeps every term (same filtering as the former min_df=0); an integer
# min_df must be >= 1 under scikit-learn's parameter validation.
tf_new = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix_new = tf_new.fit_transform(movies['description_genre'])

In [44]:
# TfidfVectorizer L2-normalises each row by default (norm is not overridden),
# so this dot-product kernel equals the cosine similarity between descriptions.
# The result is a dense (n_movies x n_movies) matrix.
cosine_sim_new = linear_kernel(tfidf_matrix_new, tfidf_matrix_new)

In [45]:
# drop=True renumbers the rows 0..n-1 without adding an 'index' column, so the
# cell can be re-run safely (a plain reset_index() adds a new column each run).
movies = movies.reset_index(drop=True)
titles = movies['title']
# Title -> row position in `movies` (and therefore in cosine_sim_new).
# Duplicate titles map to several positions.
indices = pd.Series(movies.index, index=movies['title'])
indices.head(2)

title
Avatar                                      0
Pirates of the Caribbean: At World's End    1
dtype: int64

In [46]:
def get_recommendations_new(title):
    """Return the movieIds of the 10 movies most similar to ``title``.

    Similarity is read from the precomputed ``cosine_sim_new`` matrix; the row
    for ``title`` is found through the ``indices`` title -> position lookup.

    Args:
        title: a movie title present in ``indices``.

    Returns:
        pandas Series of up to 10 movieId values, most similar first. The
        queried movie itself is never included.

    Raises:
        KeyError: if ``title`` is not in ``indices``.
    """
    idx = indices[title]
    # Duplicate titles make the lookup return a Series instead of a scalar.
    if isinstance(idx, pd.Series):
        if len(idx) > 1:
            print("ALERT: Multiple values")
        # .iloc picks the first match by position; idx[0] would be a label
        # lookup on a title-indexed Series.
        idx = idx.iloc[0]
    # Exclude the movie itself explicitly instead of assuming it sorts first:
    # a movie with an identical description ties at the top score.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim_new[idx]) if i != idx]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [i for i, _ in sim_scores[:10]]
    return movies['movieId'].iloc[movie_indices]

##### Popularity model

In [47]:
def genre_based_popularity(genre):
    """Return the movieIds of the 10 most popular movies matching ``genre``.

    A movie matches when ``genre in`` its ``genres`` value is true. Matches are
    ordered by the ``popularity`` column, highest first.

    Returns:
        list of at most 10 movieId values.
    """
    return (
        movies.loc[movies['genres'].map(lambda entry: genre in entry)]
        .sort_values(by='popularity', ascending=False)['movieId']
        .head(10)
        .values.tolist()
    )

# genre_based_popularity('Animation')[['title', 'popularity']].head(25)

In [48]:
user_info = pd.read_csv('user_info.csv')

In [49]:
def _parse_user_vector(raw):
    """Convert a stringified vector such as '[4.4 4.6 0.]' into a float ndarray.

    Values that are already ndarrays are passed through, so the cell can be
    re-run without failing on ``ndarray.replace``.
    """
    if isinstance(raw, np.ndarray):
        return raw.astype(float)
    # Strip the brackets, then split on whitespace to get the number tokens.
    tokens = raw.replace('[', ' ').replace(']', ' ').strip().split()
    return np.asarray(tokens).astype(float)


user_info['user_vector'] = user_info['user_vector'].apply(_parse_user_vector)


In [50]:
def user_top_genre(userId):
    """Return the names of the three genres with the highest values in a user's vector.

    Looks up the first ``user_info`` row for ``userId``, prints its vector, and
    maps the three largest positions to names through ``idx_to_genre``.

    Returns:
        list of three genre names, highest value first.
    """
    is_user = user_info['userId'] == userId
    user_vec = user_info.loc[is_user, 'user_vector'].values[0].copy()
    print("User Vector: ", user_vec)
    # Ascending argsort, reversed, gives positions from largest to smallest.
    ranked_positions = np.argsort(user_vec)[::-1]
    return [idx_to_genre[position] for position in ranked_positions[:3]]

In [51]:
user_top_genre(1)

User Vector:  [4.40298507 4.66666667 4.48571429 4.265625   4.26470588 4.3
 4.52459016 4.30434783 4.32432432 4.15555556 3.30769231 4.0625
 4.2        4.55       4.64705882 0.         0.         4.
 5.         0.        ]


['Film-Noir', 'Animation', 'Musical']

##### Hybrid model

In [62]:
# NOTE(review): surprise's dump.load returns a (predictions, algo) tuple, so
# each name below is bound to that tuple rather than to a fitted algorithm;
# unpack with `_, algo = dump.load(...)` if the model itself is wanted.
# NOTE(review): 'dump_KNN' and 'dump_SVDpp' are not the file names written by
# the dump.dump calls earlier in this notebook; presumably they come from
# another run. Confirm the files exist.
# hybrid() below uses knnbaseline_algo / svdpp_algo, not these two names.
knn_baseline = dump.load('dump_KNN')
svdpp = dump.load('dump_SVDpp') 

In [63]:
# List of users in testing data:
user_list = testdf['userId'].unique()

In [64]:
# type(testdf['userId'][0])
test_movies = testdf[testdf['userId'] == 60]
test_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
1745,60,48,3.0,1393541734,"['Animation', 'Children', 'Drama', 'Musical', ...",[]
1746,60,362,4.0,1393541920,"['Adventure', 'Children', 'Romance']",[]
1747,60,805,4.0,1393541754,"['Drama', 'Thriller']",[]
1748,60,2150,4.0,1393541897,"['Adventure', 'Comedy']",[]


In [65]:
# Combined model prediction on the testing data: the user's top predicted movies seed further candidates drawn from movie similarity and genre popularity

def _blended_estimate(userId, movie_id, knn_weight=0.6, svdpp_weight=0.4):
    """Weighted blend of the KNNBaseline and SVD++ rating estimates for one (user, movie) pair."""
    knn_est = knnbaseline_algo.predict(userId, movie_id).est
    svdpp_est = svdpp_algo.predict(userId, movie_id).est
    return knn_weight * knn_est + svdpp_weight * svdpp_est


def hybrid(userId):
    """Build a hybrid recommendation list for one user.

    Candidates come from three sources, each scored with the same blended
    KNNBaseline / SVD++ estimate:

    1. 'SVD + CF'         - the user's 4 best-scoring movies in ``testdf``.
    2. 'Movie similarity' - movies similar to each of those 4, from
                            ``get_recommendations_new``.
    3. 'Popularity'       - popular movies in the user's top genres, from
                            ``user_top_genre`` and ``genre_based_popularity``.

    Duplicate movieIds keep their first occurrence, so the sources take
    priority in the order above. Movies the user already rated in ``traindf``
    are removed.

    Args:
        userId: id of the user, as stored in ``testdf`` / ``traindf``.

    Returns:
        DataFrame with columns 'movieId', 'est' and 'Model'. When extra
        candidates are appended the index is renumbered from 0.

    NOTE(review): the saved output of this notebook shows ``KeyError: 'movieId'``
    raised from a call to this function. Presumably the ``movies`` frame in
    that session had no 'movieId' column - confirm the schema of tmdb_movies.csv.
    """
    # .copy() so the new columns are written to an independent frame rather
    # than to a slice of testdf (the source of the SettingWithCopyWarning).
    user_movies = testdf[testdf['userId'] == userId].copy()
    user_movies['est'] = user_movies['movieId'].apply(lambda x: _blended_estimate(userId, x))
    user_movies = user_movies.sort_values(by='est', ascending=False).head(4).copy()
    user_movies['Model'] = 'SVD + CF'

    recommend_list = user_movies[['movieId', 'est', 'Model']]
    print(recommend_list.head())

    movie_list = recommend_list['movieId'].values.tolist()
    print(movie_list)

    # Content-based candidates: movies similar to each top-rated movie.
    sim_movies_list = []
    for movie_id in movie_list:
        matching_titles = movies['title'][movies['movieId'] == movie_id].values
        # Rated movies missing from the metadata table cannot seed a
        # similarity lookup; skip them instead of failing on .values[0].
        if len(matching_titles) == 0:
            continue
        sim_movies_list.extend(get_recommendations_new(matching_titles[0]))

    # Popularity candidates: popular movies in the user's top genres.
    top_genre_list = user_top_genre(userId)
    print("User top genre list: ", top_genre_list)

    popular_movies = []
    for top_genre in top_genre_list:
        popular_movies.extend(genre_based_popularity(top_genre))
    print("Final list: ", popular_movies)

    # Score every extra candidate, then append them in one concat; growing
    # the frame row by row is quadratic.
    extra_rows = [
        (movie_id, _blended_estimate(userId, movie_id), 'Movie similarity')
        for movie_id in sim_movies_list
    ]
    extra_rows.extend(
        (movie_id, _blended_estimate(userId, movie_id), 'Popularity')
        for movie_id in popular_movies
    )
    if extra_rows:
        extra_df = pd.DataFrame(extra_rows, columns=['movieId', 'est', 'Model'])
        recommend_list = pd.concat([recommend_list, extra_df], ignore_index=True)

    recommend_list = recommend_list.drop_duplicates(subset=['movieId'])

    # Remove movies in training for this user
    train_movie_ids = set(traindf[traindf['userId'] == userId]['movieId'].values.tolist())
    recommend_list = recommend_list[~recommend_list['movieId'].isin(train_movie_ids)]

    return recommend_list

In [55]:
# traindf[traindf['userId'] == 9].sort_values(by = 'rating', ascending = False)
traindf[traindf['userId'] == 524].sort_values(by = 'rating', ascending = False)

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
66066,524,1200,5.0,851609623,"['Action', 'Adventure', 'Horror', 'Sci-Fi']",[]
66073,524,1233,5.0,851609788,"['Action', 'Drama', 'War']",[]
66032,524,541,5.0,851609310,"['Action', 'Sci-Fi', 'Thriller']",[]
66021,524,377,5.0,851608745,"['Action', 'Romance', 'Thriller']",[]
66039,524,589,5.0,851608875,"['Action', 'Sci-Fi']",[]
...,...,...,...,...,...,...
66062,524,1193,1.0,851609665,['Drama'],[]
66007,524,173,1.0,851609191,"['Action', 'Crime', 'Sci-Fi']",[]
66033,524,544,1.0,851609066,"['Action', 'Crime']",[]
65989,524,12,1.0,852404800,"['Comedy', 'Horror']",[]


In [29]:
testdf[testdf['userId'] == 574]
# testdf[testdf['userId'] == 574]
# testdf[testdf['userId'] == 576]

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
17795,574,110,5.0,834634504,"['Action', 'Drama', 'War']",[]
17796,574,150,4.0,834634383,"['Adventure', 'Drama', 'IMAX']",[]
17797,574,380,4.0,834634383,"['Action', 'Adventure', 'Comedy', 'Romance', '...",[]
17798,574,434,4.0,834634464,"['Action', 'Adventure', 'Thriller']",[]
17799,574,593,5.0,834634504,"['Crime', 'Horror', 'Thriller']",[]


In [71]:
# movie_ids = hybrid(1)
# movie_ids = hybrid(2)
movie_ids = hybrid(574)
# movie_ids = hybrid(9)
# movie_ids = hybrid(576)

       movieId       est     Model
17799      593  4.353800  SVD + CF
17795      110  4.330498  SVD + CF
17796      150  4.192009  SVD + CF
17797      380  3.822457  SVD + CF
[593, 110, 150, 380]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


KeyError: 'movieId'

In [56]:
def get_title(x):
    """Return the titles of all ``movies`` rows whose movieId equals ``x['movieId']``.

    Intended for row-wise ``DataFrame.apply``. The result is a NumPy array,
    which is empty when no row matches.
    """
    is_match = movies['movieId'] == x['movieId']
    return movies.loc[is_match, 'title'].values

In [57]:
def get_genre(x):
    """Return the genres of all ``movies`` rows whose movieId equals ``x['movieId']``.

    Intended for row-wise ``DataFrame.apply``. The result is a NumPy array,
    which is empty when no row matches.
    """
    is_match = movies['movieId'] == x['movieId']
    return movies.loc[is_match, 'genres'].values

In [68]:
# movie_ids['title'] = movie_ids.apply(get_title, axis=1)
# movie_ids['genre'] = movie_ids.apply(get_genre, axis=1)

In [69]:
# movie_ids.sort_values(by='est', ascending = False).head(10)