# How to build basic simple and content-based recommender systems

## There are 3 types of recommender systems, namely:
1. Simple recommenders. They offer generalized recommendations to every user, based on movie popularity and/or genre.



2. Content-based recommenders. They suggest items similar to a particular item, using item metadata.



3. Collaborative filtering engines. They try to predict the rating or preference that a user would give an item, based on the past ratings and preferences of other users.

### 1. Simple recommenders

In [21]:
import os

In [22]:
# Show the current working directory (shell command run through IPython's "!" escape).
!pwd

/home/debonair/Documents/ML Datasets


In [23]:
# Move into the dataset folder only when we are not already inside it, so that
# re-running this cell does not fail with FileNotFoundError (the relative path
# no longer exists once the working directory has changed).
if os.path.basename(os.getcwd()) != 'RECOMMENDATION SYSTEMS':
    os.chdir('RECOMMENDATION SYSTEMS/')

In [24]:
# Confirm the working directory after the os.chdir() call above.
!pwd

/home/debonair/Documents/ML Datasets/RECOMMENDATION SYSTEMS


In [25]:
import pandas as pd

In [26]:
# Read the movies metadata CSV into a DataFrame.
# low_memory=False turns off pandas' chunked parsing, which can otherwise
# lead to mixed-type inference within a column.
metadata = pd.read_csv(
    'movies_metadata.csv',
    low_memory=False,
)

In [27]:
# Preview the first five rows of the metadata frame.
metadata.head(n=5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [28]:
# C: mean of the 'vote_average' column over every movie in the dataset.
# It is the default value of the C parameter of weighted_rating().
C = metadata.loc[:, 'vote_average'].mean()
C

5.618207215133889

### The average movie rating in this dataset is about 5.6 out of 10.

In [29]:
# m: minimum number of votes a movie needs to appear in the chart,
# taken as the 90th percentile of the 'vote_count' column.
m = metadata.loc[:, 'vote_count'].quantile(q=0.90)
m

160.0

In [31]:
# Keep only the movies that received at least m votes.
# Filtering first and copying afterwards duplicates just the qualifying rows
# (rather than the whole metadata frame) and yields an independent DataFrame
# that later cells can add columns to.
qualified_movies = metadata.loc[metadata['vote_count'] >= m].copy()
qualified_movies.shape

(4555, 24)

In [32]:
# (rows, columns) of the full, unfiltered metadata frame, for comparison with qualified_movies.shape.
metadata.shape

(45466, 24)

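### Calculate the weighted rating for each qualified movie.

The score uses the weighted-rating formula $WR = \frac{v}{v + m}\,R + \frac{m}{v + m}\,C$, where $v$ is the movie's vote count, $m$ is the minimum number of votes required to be in the chart, $R$ is the movie's average vote, and $C$ is the mean vote across the whole dataset.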

In [33]:
def weighted_rating(x, m = m, C = C):
    """Return the weighted rating (IMDB formula) for a movie record.

    The result blends the record's own average vote with the overall mean
    vote, each weighted by the record's share of votes relative to ``m``:

        v / (v + m) * R  +  m / (m + v) * C

    Parameters
    ----------
    x : object indexable by 'vote_count' and 'vote_average'
        (e.g. one DataFrame row when used with ``DataFrame.apply(axis=1)``).
    m : vote-count threshold used as the weight of the overall mean.
        Defaults to the value the notebook-level ``m`` had when this
        function was defined.
    C : overall mean vote. Defaults to the value the notebook-level ``C``
        had when this function was defined.
    """
    votes = x['vote_count']
    rating = x['vote_average']

    # Weight given to the movie's own rating and to the global mean.
    own_weight = votes / (votes + m)
    mean_weight = m / (m + votes)

    return (own_weight * rating) + (mean_weight * C)

#### Define a new feature *score* and calculate its value with *weighted_rating()*

In [34]:
# weighted_rating() only uses column lookups and arithmetic, so it can be applied
# to the whole frame in a single vectorized call instead of a slow row-by-row
# DataFrame.apply(axis=1); the resulting 'score' values are the same.
qualified_movies['score'] = weighted_rating(qualified_movies)

In [35]:
# Rank the qualified movies from the highest to the lowest weighted score.
qualified_movies = qualified_movies.sort_values('score', ascending=False)

# Show the top 20 movies together with the columns that explain their score.
qualified_movies[['title', 'vote_count', 'vote_average', 'score']].head(20)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


### 2. Content-based recommenders

In [37]:
# Preview the first plot overviews; a bare last expression lets the notebook
# render the result itself instead of routing it through print().
metadata['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object


In [38]:
#Import TfIdfVectorizer from scikit-learn.

from sklearn.feature_extraction.text import TfidfVectorizer

# Replace missing overviews (NaN) with an empty string before vectorizing.
metadata['overview'] = metadata['overview'].fillna(value='')

# TF-IDF vectorizer that drops English stop words such as 'the' and 'a'.
tfidf = TfidfVectorizer(stop_words='english')

In [40]:
# Learn the vocabulary from the overviews and build the TF-IDF matrix in one step.
tfidf_matrix = tfidf.fit_transform(raw_documents=metadata['overview'])

# (number of overviews, number of vocabulary terms)
tfidf_matrix.shape


(45466, 75827)

In [43]:
# Mapping from feature integer indices to feature names.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); fall back to the old method only when the
# new one is unavailable so the cell runs on both old and new versions.
_get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names

# list(...) keeps the displayed output a plain Python list on every version.
list(_get_names()[5040:5060])

['averting',
 'averts',
 'avery',
 'avetik',
 'avett',
 'aveugle',
 'aveyron',
 'avgående',
 'avi',
 'avian',
 'aviation',
 'aviator',
 'aviators',
 'aviatrix',
 'avid',
 'avidly',
 'avila',
 'avinash',
 'avioliittonsa',
 'avis']

In [None]:
#Import linear kernel

from sklearn.metric