<a href="https://www.kaggle.com/code/k3rnelp4n1c/movie-recommender-system?scriptVersionId=141943780" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import pandas as pd
import numpy as np

movies = pd.read_csv("/kaggle/input/movie-recommender-data/movies.csv")

In [2]:
# movies.head(5)

### Popularity Based Filtering

In [3]:
'''
Calculate a weighted rating : 
WR = (v / (v + m)) * R + (m / (v + m)) * C
v = number of votes for a movie
m = minimum number of votes required
R = average rating of movie
C = average rating across all movies
'''

m = movies['vote_count'].quantile(0.9)
C = movies['vote_average'].mean()
m,C

(1838.4000000000015, 6.092171559442016)

In [4]:
# To get popular movies only which has vote count > 1840 (m)
movies_filtered = movies.copy().loc[movies['vote_count'] >= m]
movies_filtered.shape

(481, 20)

In [5]:
def weighted_rating(df, m=m, C=C):
    R = df['vote_average']
    v = df['vote_count']
    wr = ((v / (v + m)) * R) + (m / (v + m) * C)
    return wr

In [6]:
movies_filtered['weighted_rating'] = movies_filtered.apply(weighted_rating, axis = 1)

In [7]:
movies_filtered.sort_values('weighted_rating', ascending = False).head(5)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,weighted_rating
1881,25000000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 80, ""name...",,278,"[{""id"": 378, ""name"": ""prison""}, {""id"": 417, ""n...",en,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,136.747729,"[{""name"": ""Castle Rock Entertainment"", ""id"": 97}]",...,1994-09-23,28341469,142.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Fear can hold you prisoner. Hope can set you f...,The Shawshank Redemption,8.5,8205,8.059258
662,63000000,"[{""id"": 18, ""name"": ""Drama""}]",http://www.foxmovies.com/movies/fight-club,550,"[{""id"": 825, ""name"": ""support group""}, {""id"": ...",en,Fight Club,A ticking-time-bomb insomniac and a slippery s...,146.757391,"[{""name"": ""Regency Enterprises"", ""id"": 508}, {...",...,1999-10-15,100853753,139.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Mischief. Mayhem. Soap.,Fight Club,8.3,9413,7.939256
65,185000000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 28, ""name...",http://thedarkknight.warnerbros.com/dvdsite/,155,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight,Batman raises the stakes in his war on crime. ...,187.322927,"[{""name"": ""DC Comics"", ""id"": 429}, {""name"": ""L...",...,2008-07-16,1004558444,152.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Why So Serious?,The Dark Knight,8.2,12002,7.92002
3232,8000000,"[{""id"": 53, ""name"": ""Thriller""}, {""id"": 80, ""n...",,680,"[{""id"": 396, ""name"": ""transporter""}, {""id"": 14...",en,Pulp Fiction,"A burger-loving hit man, his philosophical par...",121.463076,"[{""name"": ""Miramax Films"", ""id"": 14}, {""name"":...",...,1994-10-08,213928762,154.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Just because you are a character doesn't mean ...,Pulp Fiction,8.3,8428,7.904645
96,160000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 53, ""nam...",http://inceptionmovie.warnerbros.com/,27205,"[{""id"": 1014, ""name"": ""loss of lover""}, {""id"":...",en,Inception,"Cobb, a skilled thief who commits corporate es...",167.58371,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,2010-07-14,825532764,148.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Your mind is the scene of the crime.,Inception,8.1,13752,7.863239


### Content-Based FIltering

In [8]:
id = [168259, 49013, 118340, 140300, 49026, 49529]
movies_small = movies['id'].isin(id)
movies_small = movies[movies_small]
movies_small.index = [0,1,2,3,4,5]
movies_small

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
1,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124
2,200000000,"[{""id"": 16, ""name"": ""Animation""}, {""id"": 10751...",http://www.disney.go.com/cars/,49013,"[{""id"": 830, ""name"": ""car race""}, {""id"": 9663,...",en,Cars 2,Star race car Lightning McQueen and his pal Ma...,49.98659,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2011-06-11,559852396,106.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Ka-ciao!,Cars 2,5.8,2033
3,190000000,"[{""id"": 28, ""name"": ""Action""}]",http://www.furious7.com/,168259,"[{""id"": 830, ""name"": ""car race""}, {""id"": 3428,...",en,Furious 7,Deckard Shaw seeks revenge against Dominic Tor...,102.322217,"[{""name"": ""Universal Pictures"", ""id"": 33}, {""n...","[{""iso_3166_1"": ""JP"", ""name"": ""Japan""}, {""iso_...",2015-04-01,1506249360,137.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Vengeance Hits Home,Furious 7,7.3,4176
4,170000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 878, ""na...",http://marvel.com/guardians,118340,"[{""id"": 8828, ""name"": ""marvel comic""}, {""id"": ...",en,Guardians of the Galaxy,"Light years from Earth, 26 years after being a...",481.098624,"[{""name"": ""Marvel Studios"", ""id"": 420}, {""name...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2014-07-30,773328629,121.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,All heroes start somewhere.,Guardians of the Galaxy,7.9,9742
5,145000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.kungfupanda.com/,140300,"[{""id"": 478, ""name"": ""china""}, {""id"": 779, ""na...",en,Kung Fu Panda 3,"Continuing his ""legendary adventures of awesom...",56.747978,"[{""name"": ""Twentieth Century Fox Film Corporat...","[{""iso_3166_1"": ""CN"", ""name"": ""China""}, {""iso_...",2016-01-23,521170825,95.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Grab destiny by the rice dumplings.,Kung Fu Panda 3,6.7,1603


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words = 'english')

movies_small.loc[:,'overview'] = movies_small['overview'].fillna("")

In [10]:
tfidf_matrix = tfidf.fit_transform(movies_small['overview'])

In [11]:
pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

Unnamed: 0,26,abducted,accuser,adventure,adventures,assumes,attorney,awesomeness,bane,barsoom,...,terrorist,threats,toretto,transported,villainous,wanted,war,weary,world,years
0,0.0,0.0,0.0,0.0,0.0,0.13565,0.271301,0.0,0.13565,0.0,...,0.13565,0.0,0.0,0.0,0.13565,0.0,0.0,0.0,0.0,0.111235
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.353458,...,0.0,0.0,0.0,0.176729,0.0,0.0,0.176729,0.176729,0.14492,0.0
2,0.0,0.0,0.0,0.190097,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.155883,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.237761,0.237761,0.237761,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.237761,0.0,0.0,0.0,0.389934
5,0.0,0.0,0.0,0.0,0.270444,0.0,0.0,0.270444,0.0,0.0,...,0.0,0.270444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
from sklearn.metrics.pairwise import linear_kernel
similarity_matrix = linear_kernel(tfidf_matrix, tfidf_matrix)

In [13]:
similarity_matrix

array([[1.        , 0.01612024, 0.        , 0.        , 0.0433744 ,
        0.        ],
       [0.01612024, 1.        , 0.02259057, 0.        , 0.        ,
        0.03213867],
       [0.        , 0.02259057, 1.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 1.        , 0.        ,
        0.        ],
       [0.0433744 , 0.        , 0.        , 0.        , 1.        ,
        0.        ],
       [0.        , 0.03213867, 0.        , 0.        , 0.        ,
        1.        ]])

### Find most silimar movie to a certain movie

In [14]:
movie_title = "John Carter"

In [15]:
idx = movies_small.loc[movies_small['title'] == movie_title].index[0]
idx

1

In [16]:
scores = list(enumerate(similarity_matrix[idx]))

In [17]:
scores = sorted(scores, key=lambda x: x[1], reverse = True)
scores

[(1, 1.0000000000000004),
 (5, 0.032138674066915646),
 (2, 0.022590565795326856),
 (0, 0.016120240648257757),
 (3, 0.0),
 (4, 0.0)]

In [18]:
movie_indices = [tpl[0] for tpl in scores[1:4]]
list(movies_small['title'].iloc[movie_indices])

['Kung Fu Panda 3', 'Cars 2', 'The Dark Knight Rises']

In [19]:
def similar_movies(movie_title, nr_movies):
    idx = movies_small.loc[movies_small['title'] == movie_title].index[0]
    scores = list(enumerate(similarity_matrix[idx]))
    movie_indices = [tpl[0] for tpl in scores[1:nr_movies+1]]
    out = list(movies_small['title'].iloc[movie_indices])
    
    return out

In [20]:
similar_movies("Kung Fu Panda 3", 3)

['John Carter', 'Cars 2', 'Furious 7']

### Collaborative- Based Filtering

In [21]:
ratings = pd.read_csv("/kaggle/input/movie-recommender-data/ratings.csv")[['userId', 'movieId', 'rating']]
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [22]:
from surprise import Dataset, Reader

reader = Reader(rating_scale = (1, 5))
dataset = Dataset.load_from_df(ratings, reader)
dataset

<surprise.dataset.DatasetAutoFolds at 0x7e4ef751ae00>

In [23]:
trainset = dataset.build_full_trainset()

In [24]:
from surprise import SVD
svd = SVD()


In [25]:
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7e4eea3df730>

In [26]:
# Prediciton
svd.predict(2, 292)

Prediction(uid=2, iid=292, r_ui=None, est=3.563743229008718, details={'was_impossible': False})

In [27]:
from surprise import model_selection

model_selection.cross_validate(svd, dataset, measures =['RMSE', 'MAE'])

{'test_rmse': array([0.89076497, 0.90093349, 0.90063628, 0.88922316, 0.8992811 ]),
 'test_mae': array([0.68792704, 0.69437239, 0.69437649, 0.68239927, 0.69239167]),
 'fit_time': (1.4150187969207764,
  1.4069044589996338,
  1.4486136436462402,
  1.5003254413604736,
  1.4151523113250732),
 'test_time': (0.21661138534545898,
  0.34227967262268066,
  0.2171647548675537,
  0.3408989906311035,
  0.21082496643066406)}